Numpy¶

In [1]:
import numpy as np
In [2]:
l=[1,2,3,4,5]
In [6]:
#convert to array
arr=np.array(l)
In [7]:
type(arr)
Out[7]:
numpy.ndarray
In [8]:
np.asarray(l)
Out[8]:
array([1, 2, 3, 4, 5])
In [12]:
arr1=np.array([[1,2,3],[2,3,4]])
In [13]:
arr.ndim
Out[13]:
1
In [14]:
arr1.ndim
Out[14]:
2
In [16]:
mat=np.matrix(l)
In [17]:
mat
Out[17]:
matrix([[1, 2, 3, 4, 5]])
In [18]:
a=arr
In [19]:
a
Out[19]:
array([1, 2, 3, 4, 5])
In [20]:
arr[0]
Out[20]:
1
In [21]:
arr[0]=100
In [22]:
a
Out[22]:
array([100,   2,   3,   4,   5])
In [23]:
b=np.copy(arr)
In [24]:
b
Out[24]:
array([100,   2,   3,   4,   5])
In [25]:
b[0]=234
In [26]:
b
Out[26]:
array([234,   2,   3,   4,   5])
In [27]:
arr
Out[27]:
array([100,   2,   3,   4,   5])
In [29]:
list(i*i for i in  range(5)) 
Out[29]:
[0, 1, 4, 9, 16]
In [31]:
np.fromstring('23 56 76',sep=' ')
Out[31]:
array([23., 56., 76.])
In [32]:
arr.size
Out[32]:
5
In [33]:
arr1.size
Out[33]:
6
In [34]:
arr.shape
Out[34]:
(5,)
In [35]:
import numpy as np
In [36]:
list(range(5))
Out[36]:
[0, 1, 2, 3, 4]
In [37]:
np.arange(.4,10.4,0.2)
Out[37]:
array([ 0.4,  0.6,  0.8,  1. ,  1.2,  1.4,  1.6,  1.8,  2. ,  2.2,  2.4,
        2.6,  2.8,  3. ,  3.2,  3.4,  3.6,  3.8,  4. ,  4.2,  4.4,  4.6,
        4.8,  5. ,  5.2,  5.4,  5.6,  5.8,  6. ,  6.2,  6.4,  6.6,  6.8,
        7. ,  7.2,  7.4,  7.6,  7.8,  8. ,  8.2,  8.4,  8.6,  8.8,  9. ,
        9.2,  9.4,  9.6,  9.8, 10. , 10.2])
In [38]:
 np.zeros((3,4))
Out[38]:
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])
In [39]:
 np.zeros((3,4,2)) #dimention,row,column
Out[39]:
array([[[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]])
In [40]:
np.ones(5)
Out[40]:
array([1., 1., 1., 1., 1.])
In [41]:
np.ones((3,4))
Out[41]:
array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])
In [42]:
np.random.rand(2,3)
Out[42]:
array([[0.88744277, 0.31751246, 0.56421228],
       [0.27202733, 0.34274517, 0.31403072]])
In [44]:
arr2=np.random.randint(1,5,(3,4))
In [45]:
arr2
Out[45]:
array([[3, 4, 4, 2],
       [1, 1, 3, 4],
       [1, 2, 3, 2]])
In [46]:
arr2.size
Out[46]:
12
In [48]:
arr2.reshape(4,3)
Out[48]:
array([[3, 4, 4],
       [2, 1, 1],
       [3, 4, 1],
       [2, 3, 2]])
In [49]:
arr2>2
Out[49]:
array([[ True,  True,  True, False],
       [False, False,  True,  True],
       [False, False,  True, False]])
In [51]:
arr1[0]
Out[51]:
array([1, 2, 3])
In [52]:
arr2
Out[52]:
array([[3, 4, 4, 2],
       [1, 1, 3, 4],
       [1, 2, 3, 2]])
In [53]:
arr2[2:4,[2,3]]
Out[53]:
array([[3, 2]])
In [54]:
arr1@arr2 #matrix multiplication
Out[54]:
array([[ 8, 12, 19, 16],
       [13, 19, 29, 24]])
In [58]:
arr3=np.random.randint(1,10,(4,4))
In [59]:
arr3
Out[59]:
array([[2, 8, 8, 4],
       [8, 4, 5, 2],
       [9, 1, 3, 6],
       [9, 8, 9, 9]])
In [60]:
arr3.T
Out[60]:
array([[2, 8, 9, 9],
       [8, 4, 1, 8],
       [8, 5, 3, 9],
       [4, 2, 6, 9]])
In [61]:
np.repeat(data)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [61], in <cell line: 1>()
----> 1 np.repeat(data)

NameError: name 'data' is not defined
In [70]:
data=np.random.randint(3,6,(1,4))
In [71]:
data
Out[71]:
array([[4, 3, 4, 3]])
In [72]:
np.repeat(data,4) #it repeat 4 times
Out[72]:
array([4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3])
In [73]:
np.diag(np.array([1,2,3,4])) #diagonal matrix
Out[73]:
array([[1, 0, 0, 0],
       [0, 2, 0, 0],
       [0, 0, 3, 0],
       [0, 0, 0, 4]])
In [74]:
arr1=np.random.randint(1,10,(3,4))
arr2=np.random.randint(1,10,(3,4))
In [75]:
arr1
Out[75]:
array([[7, 7, 6, 3],
       [3, 5, 7, 5],
       [8, 1, 6, 2]])
In [76]:
arr2
Out[76]:
array([[8, 2, 9, 7],
       [3, 8, 3, 6],
       [6, 3, 7, 9]])
In [77]:
arr1>arr2
Out[77]:
array([[False,  True, False, False],
       [False, False,  True, False],
       [ True, False, False, False]])

string operion in array¶

In [78]:
arr=np.array(['sudh','kumar'])
In [79]:
arr
Out[79]:
array(['sudh', 'kumar'], dtype='<U5')
In [80]:
#convert to upper case
np.char.upper(arr)
Out[80]:
array(['SUDH', 'KUMAR'], dtype='<U5')
In [81]:
np.char.capitalize(arr)
Out[81]:
array(['Sudh', 'Kumar'], dtype='<U5')
In [82]:
arr1
Out[82]:
array([[7, 7, 6, 3],
       [3, 5, 7, 5],
       [8, 1, 6, 2]])
In [83]:
np.sin(arr1)
Out[83]:
array([[ 0.6569866 ,  0.6569866 , -0.2794155 ,  0.14112001],
       [ 0.14112001, -0.95892427,  0.6569866 , -0.95892427],
       [ 0.98935825,  0.84147098, -0.2794155 ,  0.90929743]])
In [84]:
np.tan(arr1)
Out[84]:
array([[ 0.87144798,  0.87144798, -0.29100619, -0.14254654],
       [-0.14254654, -3.38051501,  0.87144798, -3.38051501],
       [-6.79971146,  1.55740772, -0.29100619, -2.18503986]])
In [85]:
np.exp(arr1) #exponentional
Out[85]:
array([[1.09663316e+03, 1.09663316e+03, 4.03428793e+02, 2.00855369e+01],
       [2.00855369e+01, 1.48413159e+02, 1.09663316e+03, 1.48413159e+02],
       [2.98095799e+03, 2.71828183e+00, 4.03428793e+02, 7.38905610e+00]])
In [86]:
np.mean(arr1)
Out[86]:
5.0
In [87]:
np.median(arr1)
Out[87]:
5.5
In [89]:
np.std(arr1) #standard division
Out[89]:
2.160246899469287
In [90]:
np.var(arr1)
Out[90]:
4.666666666666667
In [91]:
np.max(arr1)
Out[91]:
8
In [92]:
np.min(arr1)
Out[92]:
1
In [93]:
np.multiply(arr1,arr2)
Out[93]:
array([[56, 14, 54, 21],
       [ 9, 40, 21, 30],
       [48,  3, 42, 18]])
In [95]:
np.subtract(arr1,arr2)
Out[95]:
array([[-1,  5, -3, -4],
       [ 0, -3,  4, -1],
       [ 2, -2, -1, -7]])
In [97]:
np.mod(arr1,arr2)
Out[97]:
array([[7, 1, 6, 3],
       [0, 5, 1, 5],
       [2, 1, 6, 2]], dtype=int32)
In [108]:
arr=np.array([2,3,0,9,5,0,6])
In [109]:
np.sort(arr)
Out[109]:
array([0, 0, 2, 3, 5, 6, 9])
In [110]:
np.count_nonzero(arr)
Out[110]:
5
In [111]:
np.where(arr>0)
Out[111]:
(array([0, 1, 3, 4, 6], dtype=int64),)
In [112]:
np.extract(arr>2,arr1)
Out[112]:
array([7, 3, 3, 7])

matplotlib¶

In [141]:
import matplotlib.pyplot as plt
In [142]:
import numpy as np
In [115]:
x=np.random.rand(50)
y=np.random.rand(50)
In [116]:
x
Out[116]:
array([0.20390335, 0.06334784, 0.21002773, 0.20997411, 0.81657932,
       0.80534733, 0.07292785, 0.46606985, 0.09790799, 0.1854319 ,
       0.77042002, 0.19494667, 0.10106707, 0.75044698, 0.33694849,
       0.38205424, 0.9060851 , 0.7114721 , 0.39069017, 0.45077087,
       0.70085369, 0.45179714, 0.68159053, 0.48475405, 0.90123781,
       0.12672423, 0.81312689, 0.12390088, 0.83231107, 0.72636327,
       0.75576696, 0.6091184 , 0.4568806 , 0.50639269, 0.62788258,
       0.96336562, 0.14599948, 0.34843149, 0.71688237, 0.88023386,
       0.68166321, 0.74488392, 0.06832264, 0.64024834, 0.75502204,
       0.90163984, 0.47112274, 0.32671817, 0.8732572 , 0.60609986])
In [117]:
y
Out[117]:
array([0.67058628, 0.4106971 , 0.20099331, 0.10702518, 0.06435525,
       0.12069608, 0.04153432, 0.55397627, 0.56690531, 0.79014101,
       0.18397709, 0.22950558, 0.00333029, 0.08779434, 0.64317556,
       0.56996028, 0.80141139, 0.64052987, 0.31799524, 0.52277043,
       0.54365779, 0.91630439, 0.75594956, 0.36419699, 0.27966255,
       0.57334445, 0.90302377, 0.96860598, 0.28965098, 0.84979023,
       0.97779768, 0.60484796, 0.88357525, 0.14307741, 0.53958289,
       0.66857452, 0.75347336, 0.09605838, 0.163594  , 0.81079885,
       0.96818755, 0.73066672, 0.79780539, 0.22988367, 0.69320177,
       0.54892592, 0.52539475, 0.69227404, 0.0391454 , 0.82968987])
In [118]:
plt.scatter(x,y) #scatter map
Out[118]:
<matplotlib.collections.PathCollection at 0x2263e091700>
In [125]:
plt.figure(figsize=(6,4))
plt.scatter(x,y,c='r')
plt.xlabel("this is x axis")
plt.ylabel("this is y axis")
plt.title("this is x vs y")
plt.grid()
In [126]:
plt.plot(x,y)
Out[126]:
[<matplotlib.lines.Line2D at 0x2263fa48520>]
In [128]:
x=np.linspace(1,10,100)
y=np.sin(x)
In [129]:
x
Out[129]:
array([ 1.        ,  1.09090909,  1.18181818,  1.27272727,  1.36363636,
        1.45454545,  1.54545455,  1.63636364,  1.72727273,  1.81818182,
        1.90909091,  2.        ,  2.09090909,  2.18181818,  2.27272727,
        2.36363636,  2.45454545,  2.54545455,  2.63636364,  2.72727273,
        2.81818182,  2.90909091,  3.        ,  3.09090909,  3.18181818,
        3.27272727,  3.36363636,  3.45454545,  3.54545455,  3.63636364,
        3.72727273,  3.81818182,  3.90909091,  4.        ,  4.09090909,
        4.18181818,  4.27272727,  4.36363636,  4.45454545,  4.54545455,
        4.63636364,  4.72727273,  4.81818182,  4.90909091,  5.        ,
        5.09090909,  5.18181818,  5.27272727,  5.36363636,  5.45454545,
        5.54545455,  5.63636364,  5.72727273,  5.81818182,  5.90909091,
        6.        ,  6.09090909,  6.18181818,  6.27272727,  6.36363636,
        6.45454545,  6.54545455,  6.63636364,  6.72727273,  6.81818182,
        6.90909091,  7.        ,  7.09090909,  7.18181818,  7.27272727,
        7.36363636,  7.45454545,  7.54545455,  7.63636364,  7.72727273,
        7.81818182,  7.90909091,  8.        ,  8.09090909,  8.18181818,
        8.27272727,  8.36363636,  8.45454545,  8.54545455,  8.63636364,
        8.72727273,  8.81818182,  8.90909091,  9.        ,  9.09090909,
        9.18181818,  9.27272727,  9.36363636,  9.45454545,  9.54545455,
        9.63636364,  9.72727273,  9.81818182,  9.90909091, 10.        ])
In [130]:
y
Out[130]:
array([ 0.84147098,  0.88704699,  0.92529707,  0.95590534,  0.978619  ,
        0.99325047,  0.99967891,  0.99785123,  0.98778253,  0.96955595,
        0.94332203,  0.90929743,  0.86776314,  0.8190622 ,  0.76359681,
        0.70182505,  0.63425707,  0.56145091,  0.48400786,  0.40256749,
        0.31780241,  0.23041267,  0.14112001,  0.05066187, -0.04021468,
       -0.1307591 , -0.22022362, -0.30786935, -0.39297247, -0.47483011,
       -0.55276624, -0.6261372 , -0.69433703, -0.7568025 , -0.8130177 ,
       -0.86251837, -0.9048957 , -0.93979971, -0.96694212, -0.98609877,
       -0.99711147, -0.99988924, -0.99440916, -0.98071647, -0.95892427,
       -0.92921254, -0.89182665, -0.84707537, -0.79532828, -0.73701276,
       -0.67261042, -0.60265314, -0.52771868, -0.44842592, -0.36542971,
       -0.2794155 , -0.19109366, -0.10119362, -0.01045784,  0.0803643 ,
        0.17052273,  0.25927286,  0.34588171,  0.429634  ,  0.50983804,
        0.58583144,  0.6569866 ,  0.72271585,  0.78247636,  0.83577457,
        0.88217031,  0.92128041,  0.95278186,  0.9764145 ,  0.99198316,
        0.99935926,  0.99848187,  0.98935825,  0.97206374,  0.94674118,
        0.9135997 ,  0.87291301,  0.82501713,  0.77030762,  0.70923631,
        0.64230758,  0.57007418,  0.49313267,  0.41211849,  0.32770071,
        0.24057653,  0.15146548,  0.06110351, -0.0297631 , -0.1203839 ,
       -0.21001048, -0.29790263, -0.38333447, -0.46560043, -0.54402111])
In [131]:
plt.plot(x,y)
Out[131]:
[<matplotlib.lines.Line2D at 0x2263fae3910>]
In [132]:
x=['a','b','c','d','e']
y=np.random.rand(5)
In [133]:
x
Out[133]:
['a', 'b', 'c', 'd', 'e']
In [134]:
y
Out[134]:
array([0.26997769, 0.43015399, 0.93267056, 0.68002515, 0.72440375])
In [136]:
plt.bar(x,y)
plt.xlabel("representing my catagorical")
plt.ylabel("representing my num vakues")
plt.title("bar plot")
plt.figure(figsize=(5,3))
Out[136]:
<Figure size 360x216 with 0 Axes>
<Figure size 360x216 with 0 Axes>
In [137]:
data=[1,1,1,5,5,8,7,8,8,9,9,0,4,2,3]
In [138]:
plt.hist(data) #it shows the frequency 1 repeated 3times
Out[138]:
(array([1., 3., 1., 1., 1., 2., 0., 1., 3., 2.]),
 array([0. , 0.9, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9. ]),
 <BarContainer object of 10 artists>)
In [145]:
x=np.random.rand(50)
y=np.random.rand(50)
z=np.random.rand(50)
fig=plt.figure()
ax=fig.add_subplot(projection='3d')
ax.scatter(x,y,z)
plt.show()

seaborn¶

In [1]:
import seaborn as sns
In [2]:
iris=sns.load_dataset('iris')
In [3]:
iris
Out[3]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

In [5]:
sns.scatterplot(x=iris.sepal_length,y=iris.sepal_width)
Out[5]:
<AxesSubplot:xlabel='sepal_length', ylabel='sepal_width'>
In [7]:
tips=sns.load_dataset('tips')
In [8]:
tips
Out[8]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [8]:
sns.scatterplot(x=tips.total_bill,y=tips.tip)
Out[8]:
<AxesSubplot:xlabel='total_bill', ylabel='tip'>
In [9]:
tips.head()
Out[9]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [10]:
tips['smoker'].value_counts()
Out[10]:
No     151
Yes     93
Name: smoker, dtype: int64
In [13]:
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,hue="smoker")
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x121d4f01c70>
In [10]:
#how are smoker and not smoker
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,style="smoker")
Out[10]:
<seaborn.axisgrid.FacetGrid at 0x22f50b0cb50>
In [11]:
#
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,style="size")
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x22f517c9e20>
In [12]:
#rhe person came for lunch or dinner
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,style="size",hue='time')
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x22f517c91c0>
In [13]:
#how many people are coming to restarent daily
sns.catplot(x='day',y='total_bill',data=tips)
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x22f5186c4f0>
In [14]:
sns.jointplot(x=tips.total_bill,y=tips.tip)
Out[14]:
<seaborn.axisgrid.JointGrid at 0x22f5294f040>
In [15]:
sns.pairplot(tips)
Out[15]:
<seaborn.axisgrid.PairGrid at 0x22f517ad6a0>

Measure Of Central Tendency¶

In [1]:
age=[12,13,14,15,21,24]
In [2]:
(12+13+14+15+21+24)/6
Out[2]:
16.5
In [3]:
import numpy as np
In [4]:
#mean
np.mean(age)
Out[4]:
16.5
In [5]:
#median
np.median(age)
Out[5]:
14.5
In [8]:
#mode
from scipy import stats
In [9]:
stats.mode(age)
Out[9]:
ModeResult(mode=array([12]), count=array([1]))

Measure of Dispersion¶

In [10]:
ages_lst=[23,24,34,34,23,25,65,75,32]
In [12]:
import numpy as np
In [14]:
mean=np.mean(ages_lst)
In [15]:
mean
Out[15]:
37.22222222222222
In [16]:
var=np.var(ages_lst)
In [17]:
var
Out[17]:
330.61728395061726
In [18]:
std=np.std(ages_lst)
In [19]:
std
Out[19]:
18.182884368290342
In [20]:
data=[[10,12,13],[34,23,65],[32,33,21]]
In [21]:
data
Out[21]:
[[10, 12, 13], [34, 23, 65], [32, 33, 21]]
In [23]:
import pandas as pd
In [26]:
df=pd.DataFrame(data,columns=["A","B","C"])
In [27]:
df
Out[27]:
A B C
0 10 12 13
1 34 23 65
2 32 33 21
In [29]:
#Row wise
df.var(axis=1)
Out[29]:
0      2.333333
1    474.333333
2     44.333333
dtype: float64
In [30]:
#column wise
df.var(axis=0)
Out[30]:
A    177.333333
B    110.333333
C    784.000000
dtype: float64
In [31]:
import seaborn as sns
In [34]:
df=sns.load_dataset('healthexp')
df.head()
Out[34]:
Year Country Spending_USD Life_Expectancy
0 1970 Germany 252.311 70.6
1 1970 France 192.143 72.2
2 1970 Great Britain 123.993 71.9
3 1970 Japan 150.437 72.0
4 1970 USA 326.961 70.9
In [35]:
import numpy as np
In [36]:
df.cov() #covarience
Out[36]:
Year Spending_USD Life_Expectancy
Year 201.098848 2.571883e+04 41.915454
Spending_USD 25718.827373 4.817761e+06 4166.800912
Life_Expectancy 41.915454 4.166801e+03 10.733902
In [37]:
#correlation
df.corr(method='spearman')
Out[37]:
Year Spending_USD Life_Expectancy
Year 1.000000 0.931598 0.896117
Spending_USD 0.931598 1.000000 0.747407
Life_Expectancy 0.896117 0.747407 1.000000
In [38]:
df=sns.load_dataset('penguins')
df.head()
Out[38]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
In [39]:
df.corr()
Out[39]:
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
bill_length_mm 1.000000 -0.235053 0.656181 0.595110
bill_depth_mm -0.235053 1.000000 -0.583851 -0.471916
flipper_length_mm 0.656181 -0.583851 1.000000 0.871202
body_mass_g 0.595110 -0.471916 0.871202 1.000000

Check Normal Distubution using QQ plot-Quantile-Quantile plot¶

In [40]:
import scipy.stats as stat
import pylab
import numpy as np
In [41]:
import seaborn as sns
In [42]:
df=sns.load_dataset("iris")
df.head()
Out[42]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [47]:
import matplotlib.pyplot as plt
In [53]:
def plot_data(df,feature):
    plt.figure(figsize=(10,6))
    plt.subplot(1,2,1)
    sns.histplot(df[feature],kde=True)
    plt.subplot(1,2,2)
    stat.probplot(df[feature],dist='norm',plot=pylab)
    plt.show()
In [54]:
plot_data(df,'sepal_length')
In [55]:
plot_data(df,'sepal_width')
In [56]:
plot_data(df,'petal_length')
In [ ]:
Chi square test
In [1]:
import scipy.stats as stat
import numpy as np
In [2]:
#No of hours student study daily in a weekliy basis
#mond,tus,wend,thus,friday,saturday,sunday
expected_data=[8,6,7,9,6,9,7]
observed_data=[7,8,6,9,9,6,7]
In [4]:
sum(expected_data),sum(observed_data)
Out[4]:
(52, 52)
In [5]:
#chi square goodness of fit
chisquare_test_statistics,p_value=stat.chisquare(observed_data,expected_data)
In [6]:
print(chisquare_test_statistics),print(p_value)
3.4345238095238093
0.7526596580922865
Out[6]:
(None, None)
In [8]:
#find the critical value
significance=0.05
dof=len(observed_data)-1
critical_value=stat.chi2.ppf(1-significance,dof)
In [9]:
critical_value
Out[9]:
12.591587243743977
In [10]:
if chisquare_test_statistics > critical_value:
    print("reject the null hypothesis")
else:
    print("accept the null hypothesis")
accept the null hypothesis

Missing values¶

In [1]:
import seaborn as sns
In [2]:
df=sns.load_dataset('titanic')
In [3]:
df.head()
Out[3]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [4]:
#check missing values
df.isnull()
Out[4]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 False False False False False False False False False False False True False False False
1 False False False False False False False False False False False False False False False
2 False False False False False False False False False False False True False False False
3 False False False False False False False False False False False False False False False
4 False False False False False False False False False False False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False False True False False False
887 False False False False False False False False False False False False False False False
888 False False False True False False False False False False False True False False False
889 False False False False False False False False False False False False False False False
890 False False False False False False False False False False False True False False False

891 rows × 15 columns

In [5]:
df.isnull().sum() #in which column how many null values are there
Out[5]:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
In [6]:
#delete the rows or data points to handle missing values
df.shape
Out[6]:
(891, 15)
In [ ]:
 
In [7]:
df.dropna().shape #it drop all nan values
Out[7]:
(182, 15)
In [8]:
##columns wise delete
df.dropna(axis=1)
Out[8]:
survived pclass sex sibsp parch fare class who adult_male alive alone
0 0 3 male 1 0 7.2500 Third man True no False
1 1 1 female 1 0 71.2833 First woman False yes False
2 1 3 female 0 0 7.9250 Third woman False yes True
3 1 1 female 1 0 53.1000 First woman False yes False
4 0 3 male 0 0 8.0500 Third man True no True
... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 0 0 13.0000 Second man True no True
887 1 1 female 0 0 30.0000 First woman False yes True
888 0 3 female 1 2 23.4500 Third woman False no False
889 1 1 male 0 0 30.0000 First man True yes True
890 0 3 male 0 0 7.7500 Third man True no True

891 rows × 11 columns

imputation missing values¶

1-Mean Value Imputation

In [9]:
sns.histplot(df['age'],kde=True)
Out[9]:
<AxesSubplot:xlabel='age', ylabel='Count'>
In [10]:
df["age_mean"]=df['age'].fillna(df['age'].mean()) #it replace all the NSAN values with mean of the age
In [ ]:
df["age_mean"]=df['age'].fillna(df['age'].mean())
In [11]:
df['age_mean']=df['age'].fillna(df['age'].mean())
In [13]:
df['age_median']=df['age'].fillna(df['age'].median())
In [15]:
df["age_median"]
Out[15]:
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age_median, Length: 891, dtype: float64
In [12]:
df[["age_mean",'age']]
Out[12]:
age_mean age
0 22.000000 22.0
1 38.000000 38.0
2 26.000000 26.0
3 35.000000 35.0
4 35.000000 35.0
... ... ...
886 27.000000 27.0
887 19.000000 19.0
888 29.699118 NaN
889 26.000000 26.0
890 32.000000 32.0

891 rows × 2 columns

2. Median value Imputation.If we have outliers in the dataset¶

In [18]:
df['age_median']=df['age'].fillna(df['age'].median())
In [ ]:
df['age_median']=df['age'].fillna(df['age'].median())
In [ ]:
 
In [20]:
df[['age_median','age']]
Out[20]:
age_median age
0 22.0 22.0
1 38.0 38.0
2 26.0 26.0
3 35.0 35.0
4 35.0 35.0
... ... ...
886 27.0 27.0
887 19.0 19.0
888 28.0 NaN
889 26.0 26.0
890 32.0 32.0

891 rows × 2 columns

Mode Imputation Technique--Categorical values¶

In [21]:
df[df['embarked'].isnull()]
Out[21]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone age_median age_mean
61 1 1 female 38.0 0 0 80.0 NaN First woman False B NaN yes True 38.0 38.0
829 1 1 female 62.0 0 0 80.0 NaN First woman False B NaN yes True 62.0 62.0
In [22]:
df['embarked'].unique()
Out[22]:
array(['S', 'C', 'Q', nan], dtype=object)
In [ ]:
mode_values=df[df['embarked'].notna()]['embarked'].mode()[0]
In [ ]:
df['median_age']=df['age'].fillna(df['age'].median())
In [16]:
df['embarked'].unique()
Out[16]:
array(['S', 'C', 'Q', nan], dtype=object)
In [ ]:
df[df['embarked'].notna()]['embarked'].mode()[0]
In [28]:
mode_value=df[df['embarked'].notna()]['embarked'].mode()[0]
In [ ]:
df[df['embarked'].notna()]#it don't gives NAN values so we can do mofe
['embarked'].mode()[0] #this is for doing mode 
In [29]:
df["embarked_mode"]=df['embarked'].fillna(mode_value)
In [30]:
df[['embarked_mode','embarked']]
Out[30]:
embarked_mode embarked
0 S S
1 C C
2 S S
3 S S
4 S S
... ... ...
886 S S
887 S S
888 S S
889 C C
890 Q Q

891 rows × 2 columns

In [31]:
df['embarked_mode'].isnull().sum()
Out[31]:
0

Handling Imbalanced Dataset¶

1.Up Sampling¶

2.Down Sampling¶

In [34]:
import numpy as np
import pandas as pd
#set the random seed for reproducinility
np.random.seed(123)
#create a datafram with two classes
n_samples=1000
class_0_ratio=0.9
n_class_0=int(n_samples * class_0_ratio)
n_class_1=n_samples - n_class_0
In [35]:
n_class_0,n_class_1
Out[35]:
(900, 100)

linear interpolation¶

In [1]:
import numpy as np
x=np.array([1,2,3,4,5,])
y=np.array([2,4,6,8,10])
In [2]:
import matplotlib.pyplot as plt
plt.scatter(x,y)
Out[2]:
<matplotlib.collections.PathCollection at 0x29db19f7790>
In [3]:
#interpollate the data using liner interpolation
x_new=np.linspace(1,5,10) #create new x values 1 to 5 ,in between 10 numbers
y_interp=np.interp(x_new,x,y)
In [4]:
plt.scatter(x_new,y_interp)
Out[4]:
<matplotlib.collections.PathCollection at 0x29db1b0ce50>

2.Cubic Interpolation with Scipy¶

In [2]:
import numpy as np
x=np.array([1,2,3,4,5])
y=np.array([1,8,27,64,125])
In [3]:
from scipy.interpolate import interp1d
In [4]:
#create a cubic interpolation function
f=interp1d(x,y,kind='cubic')
In [5]:
#interpolate the data
x_new=np.linspace(1,5,10)
y_interp=f(x_new)
In [6]:
plt.scatter(x,y)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [6], in <cell line: 1>()
----> 1 plt.scatter(x,y)

NameError: name 'plt' is not defined
In [12]:
plt.scatter(x_new,y_interp)
Out[12]:
<matplotlib.collections.PathCollection at 0x29db2cf1e20>

3.Polynomial Interpolation¶

In [13]:
#create some ample data
x=np.array([1,2,3,4,5])
y=np.array([1,4,9,16,25])
In [14]:
#interpolate the data using polynomial interpolation
p=np.polyfit(x,y,2)
In [15]:
x_new=np.linspace(1,5,10) #create new x values
y_interp=np.polyval(p,x_new) #interpolate y valies
In [16]:
plt.scatter(x_new,y_interp)
Out[16]:
<matplotlib.collections.PathCollection at 0x29db3d2d190>

Covariance and coorrelation With Python¶

In [1]:
import seaborn as sns
In [3]:
df=sns.load_dataset('healthexp')
df.head()
Out[3]:
Year Country Spending_USD Life_Expectancy
0 1970 Germany 252.311 70.6
1 1970 France 192.143 72.2
2 1970 Great Britain 123.993 71.9
3 1970 Japan 150.437 72.0
4 1970 USA 326.961 70.9
In [4]:
#covariance 
import numpy as np
In [5]:
df.cov()
Out[5]:
Year Spending_USD Life_Expectancy
Year 201.098848 2.571883e+04 41.915454
Spending_USD 25718.827373 4.817761e+06 4166.800912
Life_Expectancy 41.915454 4.166801e+03 10.733902
In [6]:
##Correlation
df.corr(method='spearman')
Out[6]:
Year Spending_USD Life_Expectancy
Year 1.000000 0.931598 0.896117
Spending_USD 0.931598 1.000000 0.747407
Life_Expectancy 0.896117 0.747407 1.000000
In [7]:
#pearrson Correlation
df.corr(method='pearson')
Out[7]:
Year Spending_USD Life_Expectancy
Year 1.000000 0.826273 0.902175
Spending_USD 0.826273 1.000000 0.579430
Life_Expectancy 0.902175 0.579430 1.000000

Exploratary data analysis¶

In [8]:
import pandas as pd
df=pd.read_csv('winequality-red.csv')
df.head()
Out[8]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [9]:
#summary of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
In [10]:
#ddescriptive summary of the dataset
df.describe()
Out[10]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000
mean 8.319637 0.527821 0.270976 2.538806 0.087467 15.874922 46.467792 0.996747 3.311113 0.658149 10.422983 5.636023
std 1.741096 0.179060 0.194801 1.409928 0.047065 10.460157 32.895324 0.001887 0.154386 0.169507 1.065668 0.807569
min 4.600000 0.120000 0.000000 0.900000 0.012000 1.000000 6.000000 0.990070 2.740000 0.330000 8.400000 3.000000
25% 7.100000 0.390000 0.090000 1.900000 0.070000 7.000000 22.000000 0.995600 3.210000 0.550000 9.500000 5.000000
50% 7.900000 0.520000 0.260000 2.200000 0.079000 14.000000 38.000000 0.996750 3.310000 0.620000 10.200000 6.000000
75% 9.200000 0.640000 0.420000 2.600000 0.090000 21.000000 62.000000 0.997835 3.400000 0.730000 11.100000 6.000000
max 15.900000 1.580000 1.000000 15.500000 0.611000 72.000000 289.000000 1.003690 4.010000 2.000000 14.900000 8.000000
In [12]:
df.shape
Out[12]:
(1599, 12)
In [13]:
#list down all the column names
df.columns
Out[13]:
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
In [14]:
df['quality'].unique()
Out[14]:
array([5, 6, 7, 4, 8, 3], dtype=int64)
In [15]:
#missing values in the dataset
df.isnull().sum()
Out[15]:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
In [17]:
#duplicate records
df[df.duplicated()]
Out[17]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
4 7.4 0.700 0.00 1.90 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
11 7.5 0.500 0.36 6.10 0.071 17.0 102.0 0.99780 3.35 0.80 10.5 5
27 7.9 0.430 0.21 1.60 0.106 10.0 37.0 0.99660 3.17 0.91 9.5 5
40 7.3 0.450 0.36 5.90 0.074 12.0 87.0 0.99780 3.33 0.83 10.5 5
65 7.2 0.725 0.05 4.65 0.086 4.0 11.0 0.99620 3.41 0.39 10.9 5
... ... ... ... ... ... ... ... ... ... ... ... ...
1563 7.2 0.695 0.13 2.00 0.076 12.0 20.0 0.99546 3.29 0.54 10.1 5
1564 7.2 0.695 0.13 2.00 0.076 12.0 20.0 0.99546 3.29 0.54 10.1 5
1567 7.2 0.695 0.13 2.00 0.076 12.0 20.0 0.99546 3.29 0.54 10.1 5
1581 6.2 0.560 0.09 1.70 0.053 24.0 32.0 0.99402 3.54 0.60 11.3 5
1596 6.3 0.510 0.13 2.30 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6

240 rows × 12 columns

In [18]:
##remove the duplicates
df.drop_duplicates(inplace=True) #inplace
In [20]:
df.shape
Out[20]:
(1359, 12)

5 Number Summary And Box Plot¶

In [1]:
## Minimum,Maximum,Median,Q1,Q3,IQR
In [2]:
import numpy as np
In [8]:
lst_marks=[42,32,56,75,89,54,32,89,90,87,67,54,45,98,99,67,74]
minimum,Q1,median,Q3,maximum=np.quantile(lst_marks,[0,0.25,0.50,0.75,1.0])
In [9]:
minimum,Q1,median,Q3,maximum
Out[9]:
(32.0, 54.0, 67.0, 89.0, 99.0)
In [10]:
IQR=Q3-Q1
print(IQR)
35.0
In [11]:
lower_fence=Q1-1.5*(IQR)
higher_fence=Q3+1.5*(IQR)
In [12]:
lower_fence
Out[12]:
1.5
In [13]:
higher_fence
Out[13]:
141.5
In [14]:
import seaborn as sns
In [15]:
sns.boxplot(lst_marks)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[15]:
<AxesSubplot:>
In [7]:
import seaborn as sns
In [8]:
df=sns.load_dataset('tips')
In [18]:
df.head()
Out[18]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [19]:
import numpy as np
mean=np.mean(df['total_bill'])
std=np.std(df['total_bill'])
print(mean,std)
19.785942622950824 8.88415057777113
In [20]:
normalized_data=[]
for i in list(df['total_bill']):
    z_score=(i-mean)/std
    normalized_data.append(z_score)
In [21]:
normalized_data
Out[21]:
[-0.3147113050904947,
 -1.0632353132988699,
 0.13777989987156108,
 0.4383151031672544,
 0.5407447042905058,
 0.6195367051545452,
 -1.239954515236787,
 0.7985071071171492,
 -0.5342033074974618,
 -0.5634689078183908,
 -1.071114513385274,
 1.7417599174609364,
 -0.49143050702841173,
 -0.15262490331304188,
 -0.5578409077566736,
 0.20193910057513573,
 -1.0643609133112133,
 -0.39350330595453414,
 -0.31696250511518154,
 0.09725829942719756,
 -0.21003050394255615,
 0.056736698982834455,
 -0.45203450659639205,
 2.2100095225958003,
 0.003833498402693769,
 -0.22241210407833414,
 -0.7221785095588132,
 -0.7987193103981659,
 0.2154463007232569,
 -0.01530170180714459,
 -1.152157714274,
 -0.16162970341178906,
 -0.5319521074727749,
 0.10176069947657156,
 -0.22578890411536412,
 0.4810879036363043,
 -0.3912521059298474,
 -0.3214649051645551,
 -0.12335930299211276,
 1.2926455125359113,
 -0.4216433062631197,
 -0.26180810451035363,
 -0.6580193088552382,
 -1.1375249141135357,
 1.1947183114620337,
 -0.16838330348584984,
 0.27510310137745836,
 1.4198383139307178,
 0.9864823091785008,
 -0.1965233037944354,
 -0.8156033105833173,
 -1.0688633133605872,
 1.6911079169054828,
 -1.108259313792607,
 0.6499279054878175,
 -0.03331130200463935,
 2.0512999208553775,
 0.7456039065370086,
 -0.9619313121879619,
 3.2061655335197283,
 0.056736698982834455,
 -0.6726521090157025,
 -0.9866945124595172,
 -0.16838330348584984,
 -0.24717530434988927,
 0.0330990987236225,
 -0.3754937057570394,
 -1.8815465222725374,
 0.049983098908774067,
 -0.537580107534492,
 -0.8741345112251752,
 -0.3057065049917472,
 0.7962559070924624,
 0.618411105142202,
 -0.5690969078801078,
 -1.0441001130890317,
 -0.21003050394255615,
 0.8345263075121387,
 0.33475990203165984,
 -0.280943304720192,
 -0.03893930206635614,
 -0.3518561054978275,
 -1.0936265136321421,
 1.451355114276334,
 -0.4283969063371801,
 1.6933591169301692,
 -0.7604489099784896,
 -0.16950890349819306,
 0.5542519044386269,
 0.15466390005671224,
 1.0337575096969243,
 0.3043687016983871,
 -1.5798857189645004,
 -0.39012650591750375,
 0.33363430201931626,
 2.294429523521557,
 0.8435311076108863,
 -0.8730089112128318,
 0.13777989987156108,
 -0.8246081106820644,
 -0.9495497120521843,
 -0.4959329070777853,
 2.7593023286193894,
 0.2964895016119835,
 0.1276494997604703,
 -0.49818410710247235,
 0.0792486992297028,
 0.610531905055798,
 -0.17401130354756705,
 -0.6163721083985315,
 -0.6512657087811776,
 -1.411045717112987,
 2.058053520929438,
 0.4687063035005267,
 0.6668119056729691,
 -0.2786921046955052,
 1.1418151108818928,
 -1.0283417129162238,
 -0.8279849107190949,
 0.48333910366099114,
 -0.911279311632508,
 -0.7165505094970961,
 -0.6220001084602487,
 -0.4317737063742105,
 -0.8223569106573776,
 1.1271823107214285,
 -1.2680945155453727,
 -0.5927345081393197,
 -0.946172912015154,
 0.34151350210572023,
 -0.07946090251071965,
 0.05448549895814766,
 -0.9698105122743659,
 -0.8471201109289329,
 -0.17176010352287985,
 -1.269220115557716,
 -1.0643609133112133,
 -0.6343817085960263,
 -0.42614570631249327,
 -0.745816109818025,
 -0.26068250449801045,
 1.6337023162759678,
 2.4092407247805854,
 0.8176423073269876,
 -0.3777449057817262,
 -1.2872297157552108,
 -0.12898730305382994,
 -0.8910185114103265,
 -1.1262689139901016,
 -1.3817801167920583,
 -0.6433865086947737,
 -0.7491929098550552,
 -0.284320104757222,
 0.5362423042411322,
 -0.001794501659023419,
 1.1328103107831458,
 3.194909533396294,
 0.5868943047965861,
 -0.7199273095341262,
 -0.3709913057076658,
 0.19293430047638854,
 -0.802096110435196,
 -0.40250810605328136,
 -0.6726521090157025,
 -0.2561801044486365,
 0.5328655042041018,
 0.10963989956297554,
 1.342171913079022,
 -1.0350953129902845,
 -1.0305929129409106,
 3.492067936654957,
 -0.44753210654701825,
 -1.411045717112987,
 1.35793031325183,
 -0.33384650530033266,
 1.4761183145478889,
 -0.21340730397958654,
 -0.5972369081886933,
 -1.146529714212283,
 1.6708471166833012,
 1.6730983167079878,
 0.3989191027352345,
 2.877490329915449,
 0.38090950253774014,
 2.3372023239906063,
 0.10176069947657156,
 0.1253982997357831,
 1.2014719115360943,
 -0.1841417036586578,
 0.37303030245133617,
 -0.46103930669513943,
 0.0027078983903501713,
 0.9741007090427233,
 -0.48467690695435117,
 -0.360860905596575,
 -1.3761521167303412,
 -1.0632353132988699,
 2.6253559271505225,
 -0.7638257100155198,
 -0.7064201093860053,
 -0.12110810296742595,
 -0.7930913103364486,
 -0.7638257100155198,
 -0.3811217058187566,
 0.0837510992790768,
 -0.3732425057323526,
 0.76586470675919,
 2.1323431217441033,
 0.5047255038955163,
 -0.7908401103117619,
 1.1564479110423573,
 0.6870727058951507,
 3.2129191335937883,
 -0.7334345096822474,
 0.9437095087094509,
 -0.7750817101389539,
 0.9414583086847638,
 -0.9225353117559422,
 -1.3558913165081594,
 1.1654527111411048,
 -0.8583761110523671,
 -0.7165505094970961,
 -1.261340915471312,
 -0.4283969063371801,
 -0.7165505094970961,
 -0.39575450597922096,
 -1.0913753136074553,
 0.07474629918032921,
 -0.732308909669904,
 0.2627215012416808,
 0.47545990357458756,
 -0.46103930669513943,
 -0.9202841117312553,
 -1.0148345127681029,
 -0.4790489068926342,
 -1.0936265136321421,
 -0.8088497105092567,
 1.468239114461485,
 1.8059191181645113,
 1.0405111097709852,
 0.8322751074874519,
 0.32462950192056905,
 -0.22128650406599054,
 -0.11322890288102197]
In [22]:
sns.histplot(df['total_bill'])
Out[22]:
<AxesSubplot:xlabel='total_bill', ylabel='Count'>

Feature scaling (standardization)¶

from sklearn.preprocessing import StandardScaler Z-score formula

In [48]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
In [3]:
scaler
Out[3]:
StandardScaler()
In [9]:
scaler.fit(df[['total_bill']])#it compute the mean and standard devasion
Out[9]:
StandardScaler()
In [12]:
scaler.transform(df[['total_bill']])#scaling down the z-score formula
Out[12]:
array([[-3.14711305e-01],
       [-1.06323531e+00],
       [ 1.37779900e-01],
       [ 4.38315103e-01],
       [ 5.40744704e-01],
       [ 6.19536705e-01],
       [-1.23995452e+00],
       [ 7.98507107e-01],
       [-5.34203307e-01],
       [-5.63468908e-01],
       [-1.07111451e+00],
       [ 1.74175992e+00],
       [-4.91430507e-01],
       [-1.52624903e-01],
       [-5.57840908e-01],
       [ 2.01939101e-01],
       [-1.06436091e+00],
       [-3.93503306e-01],
       [-3.16962505e-01],
       [ 9.72582994e-02],
       [-2.10030504e-01],
       [ 5.67366990e-02],
       [-4.52034507e-01],
       [ 2.21000952e+00],
       [ 3.83349840e-03],
       [-2.22412104e-01],
       [-7.22178510e-01],
       [-7.98719310e-01],
       [ 2.15446301e-01],
       [-1.53017018e-02],
       [-1.15215771e+00],
       [-1.61629703e-01],
       [-5.31952107e-01],
       [ 1.01760699e-01],
       [-2.25788904e-01],
       [ 4.81087904e-01],
       [-3.91252106e-01],
       [-3.21464905e-01],
       [-1.23359303e-01],
       [ 1.29264551e+00],
       [-4.21643306e-01],
       [-2.61808105e-01],
       [-6.58019309e-01],
       [-1.13752491e+00],
       [ 1.19471831e+00],
       [-1.68383303e-01],
       [ 2.75103101e-01],
       [ 1.41983831e+00],
       [ 9.86482309e-01],
       [-1.96523304e-01],
       [-8.15603311e-01],
       [-1.06886331e+00],
       [ 1.69110792e+00],
       [-1.10825931e+00],
       [ 6.49927905e-01],
       [-3.33113020e-02],
       [ 2.05129992e+00],
       [ 7.45603907e-01],
       [-9.61931312e-01],
       [ 3.20616553e+00],
       [ 5.67366990e-02],
       [-6.72652109e-01],
       [-9.86694512e-01],
       [-1.68383303e-01],
       [-2.47175304e-01],
       [ 3.30990987e-02],
       [-3.75493706e-01],
       [-1.88154652e+00],
       [ 4.99830989e-02],
       [-5.37580108e-01],
       [-8.74134511e-01],
       [-3.05706505e-01],
       [ 7.96255907e-01],
       [ 6.18411105e-01],
       [-5.69096908e-01],
       [-1.04410011e+00],
       [-2.10030504e-01],
       [ 8.34526308e-01],
       [ 3.34759902e-01],
       [-2.80943305e-01],
       [-3.89393021e-02],
       [-3.51856105e-01],
       [-1.09362651e+00],
       [ 1.45135511e+00],
       [-4.28396906e-01],
       [ 1.69335912e+00],
       [-7.60448910e-01],
       [-1.69508903e-01],
       [ 5.54251904e-01],
       [ 1.54663900e-01],
       [ 1.03375751e+00],
       [ 3.04368702e-01],
       [-1.57988572e+00],
       [-3.90126506e-01],
       [ 3.33634302e-01],
       [ 2.29442952e+00],
       [ 8.43531108e-01],
       [-8.73008911e-01],
       [ 1.37779900e-01],
       [-8.24608111e-01],
       [-9.49549712e-01],
       [-4.95932907e-01],
       [ 2.75930233e+00],
       [ 2.96489502e-01],
       [ 1.27649500e-01],
       [-4.98184107e-01],
       [ 7.92486992e-02],
       [ 6.10531905e-01],
       [-1.74011304e-01],
       [-6.16372108e-01],
       [-6.51265709e-01],
       [-1.41104572e+00],
       [ 2.05805352e+00],
       [ 4.68706304e-01],
       [ 6.66811906e-01],
       [-2.78692105e-01],
       [ 1.14181511e+00],
       [-1.02834171e+00],
       [-8.27984911e-01],
       [ 4.83339104e-01],
       [-9.11279312e-01],
       [-7.16550509e-01],
       [-6.22000108e-01],
       [-4.31773706e-01],
       [-8.22356911e-01],
       [ 1.12718231e+00],
       [-1.26809452e+00],
       [-5.92734508e-01],
       [-9.46172912e-01],
       [ 3.41513502e-01],
       [-7.94609025e-02],
       [ 5.44854990e-02],
       [-9.69810512e-01],
       [-8.47120111e-01],
       [-1.71760104e-01],
       [-1.26922012e+00],
       [-1.06436091e+00],
       [-6.34381709e-01],
       [-4.26145706e-01],
       [-7.45816110e-01],
       [-2.60682504e-01],
       [ 1.63370232e+00],
       [ 2.40924072e+00],
       [ 8.17642307e-01],
       [-3.77744906e-01],
       [-1.28722972e+00],
       [-1.28987303e-01],
       [-8.91018511e-01],
       [-1.12626891e+00],
       [-1.38178012e+00],
       [-6.43386509e-01],
       [-7.49192910e-01],
       [-2.84320105e-01],
       [ 5.36242304e-01],
       [-1.79450166e-03],
       [ 1.13281031e+00],
       [ 3.19490953e+00],
       [ 5.86894305e-01],
       [-7.19927310e-01],
       [-3.70991306e-01],
       [ 1.92934300e-01],
       [-8.02096110e-01],
       [-4.02508106e-01],
       [-6.72652109e-01],
       [-2.56180104e-01],
       [ 5.32865504e-01],
       [ 1.09639900e-01],
       [ 1.34217191e+00],
       [-1.03509531e+00],
       [-1.03059291e+00],
       [ 3.49206794e+00],
       [-4.47532107e-01],
       [-1.41104572e+00],
       [ 1.35793031e+00],
       [-3.33846505e-01],
       [ 1.47611831e+00],
       [-2.13407304e-01],
       [-5.97236908e-01],
       [-1.14652971e+00],
       [ 1.67084712e+00],
       [ 1.67309832e+00],
       [ 3.98919103e-01],
       [ 2.87749033e+00],
       [ 3.80909503e-01],
       [ 2.33720232e+00],
       [ 1.01760699e-01],
       [ 1.25398300e-01],
       [ 1.20147191e+00],
       [-1.84141704e-01],
       [ 3.73030302e-01],
       [-4.61039307e-01],
       [ 2.70789839e-03],
       [ 9.74100709e-01],
       [-4.84676907e-01],
       [-3.60860906e-01],
       [-1.37615212e+00],
       [-1.06323531e+00],
       [ 2.62535593e+00],
       [-7.63825710e-01],
       [-7.06420109e-01],
       [-1.21108103e-01],
       [-7.93091310e-01],
       [-7.63825710e-01],
       [-3.81121706e-01],
       [ 8.37510993e-02],
       [-3.73242506e-01],
       [ 7.65864707e-01],
       [ 2.13234312e+00],
       [ 5.04725504e-01],
       [-7.90840110e-01],
       [ 1.15644791e+00],
       [ 6.87072706e-01],
       [ 3.21291913e+00],
       [-7.33434510e-01],
       [ 9.43709509e-01],
       [-7.75081710e-01],
       [ 9.41458309e-01],
       [-9.22535312e-01],
       [-1.35589132e+00],
       [ 1.16545271e+00],
       [-8.58376111e-01],
       [-7.16550509e-01],
       [-1.26134092e+00],
       [-4.28396906e-01],
       [-7.16550509e-01],
       [-3.95754506e-01],
       [-1.09137531e+00],
       [ 7.47462992e-02],
       [-7.32308910e-01],
       [ 2.62721501e-01],
       [ 4.75459904e-01],
       [-4.61039307e-01],
       [-9.20284112e-01],
       [-1.01483451e+00],
       [-4.79048907e-01],
       [-1.09362651e+00],
       [-8.08849711e-01],
       [ 1.46823911e+00],
       [ 1.80591912e+00],
       [ 1.04051111e+00],
       [ 8.32275107e-01],
       [ 3.24629502e-01],
       [-2.21286504e-01],
       [-1.13228903e-01]])
In [15]:
df.head()
Out[15]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [17]:
import pandas as pd
pd.DataFrame(scaler.fit_transform(df[['total_bill','tip']]),columns=['total_bill','tip'])#in one single line
Out[17]:
total_bill tip
0 -0.314711 -1.439947
1 -1.063235 -0.969205
2 0.137780 0.363356
3 0.438315 0.225754
4 0.540745 0.443020
... ... ...
239 1.040511 2.115963
240 0.832275 -0.722971
241 0.324630 -0.722971
242 -0.221287 -0.904026
243 -0.113229 0.001247

244 rows × 2 columns

In [18]:
scaler.transform([[13,4]])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Out[18]:
array([[-0.76382571,  0.72546447]])

Normaliztion--Min MAx Scaler¶

xi-xmin/xmax-xmin

In [29]:
df=sns.load_dataset('taxis')
In [30]:
df.head()
Out[30]:
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
3 2019-03-10 01:23:59 2019-03-10 01:49:51 1 7.70 27.0 6.15 0.0 36.95 yellow credit card Hudson Sq Yorkville West Manhattan Manhattan
4 2019-03-30 13:27:42 2019-03-30 13:37:14 3 2.16 9.0 1.10 0.0 13.40 yellow credit card Midtown East Yorkville West Manhattan Manhattan
In [31]:
from sklearn.preprocessing import MinMaxScaler
In [34]:
min_max=MinMaxScaler() #standarzation
In [36]:
min_max.fit_transform(df[['distance','fare','tip']]) #see it will be in between 0,1
Out[36]:
array([[0.04359673, 0.04026846, 0.06475904],
       [0.02152589, 0.02684564, 0.        ],
       [0.0373297 , 0.04362416, 0.07108434],
       ...,
       [0.11280654, 0.10067114, 0.        ],
       [0.03051771, 0.03355705, 0.        ],
       [0.10490463, 0.09395973, 0.10120482]])
In [37]:
min_max.fit(df[['distance','fare','tip']])
Out[37]:
MinMaxScaler()
In [39]:
min_max.transform([[1.6,7.0,2.15]])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but MinMaxScaler was fitted with feature names
  warnings.warn(
Out[39]:
array([[0.04359673, 0.04026846, 0.06475904]])

unit vector¶

under root of asq2+bsq2

In [40]:
from sklearn.preprocessing import normalize
In [46]:
import pandas as pd
unit_vector=pd.DataFrame(normalize(df[['distance','fare','tip']])) #under root of a square + b square
In [47]:
unit_vector
Out[47]:
0 1 2
0 0.213461 0.933894 0.286839
1 0.156064 0.987747 0.000000
2 0.171657 0.939731 0.295702
3 0.267899 0.939386 0.213971
4 0.231742 0.965592 0.118017
... ... ... ...
6428 0.160133 0.960800 0.226322
6429 0.307453 0.951563 0.000000
6430 0.250500 0.968117 0.000000
6431 0.183497 0.983020 0.000000
6432 0.242956 0.946580 0.212034

6433 rows × 3 columns

Encoding¶

Norminal/OHE Encoding

In [49]:
import pandas as pd
In [50]:
from sklearn.preprocessing import OneHotEncoder
In [52]:
#Create a simple dataframe
df=pd.DataFrame({'color':['red','blue','green','green','red','blue']})
In [53]:
df.head()
Out[53]:
color
0 red
1 blue
2 green
3 green
4 red
In [54]:
#create an instance of onehotencoder
encoder=OneHotEncoder()
In [57]:
encoded=encoder.fit_transform(df[['color']]).toarray() #this gives binary numbers
In [58]:
import pandas as pd
encoder_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())
In [59]:
encoder_df
Out[59]:
color_blue color_green color_red
0 0.0 0.0 1.0
1 1.0 0.0 0.0
2 0.0 1.0 0.0
3 0.0 1.0 0.0
4 0.0 0.0 1.0
5 1.0 0.0 0.0
In [60]:
df.head()
Out[60]:
color
0 red
1 blue
2 green
3 green
4 red

Label Encoder¶

It assigns unique values to each other

In [61]:
from sklearn.preprocessing import LabelEncoder
In [62]:
lbl_encoder=LabelEncoder() #instance
In [63]:
lbl_encoder.fit_transform(df[['color']]) #this gives unique values
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_label.py:115: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[63]:
array([2, 0, 1, 1, 2, 0])
In [64]:
lbl_encoder.transform([['red']]) #afterwards if data may come so use just this to encode a new 
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[64]:
array([2])

Ordinal Encoding¶

It having instrinsic order or ranking.In this techique,each category is assigned a numerical value based on its position in the order. if e have catergorical variarbles 1.High school:1 2.College:2 3.Graduate:3 4.Post-gradute:4

In [65]:
#ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
In [66]:
df=pd.DataFrame({'size':['small','medium','large','medium','small','large']})
In [67]:
df
Out[67]:
size
0 small
1 medium
2 large
3 medium
4 small
5 large
In [68]:
#create as instance of ordinalencoder and then fit_transform
encoder=OrdinalEncoder(categories=[['small','medium','large']])
In [69]:
encoder.fit_transform(df[['size']])
Out[69]:
array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])
In [70]:
encoder.transform([['small']])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but OrdinalEncoder was fitted with feature names
  warnings.warn(
Out[70]:
array([[0.]])

Traget Guided Ordinal Encoding¶

It is technique used to encode based on the relationship with the target variable.This encoding technique is useful when have a categorical variable with a large number of unique categories We can replace category in the categorical variable witha numerical value based on the mean or median of the target variable for that category

In [72]:
#create a simple dataset
import pandas as pd
df=pd.DataFrame({'city':['New York','London','Paris','Tokyo','New York','Paris'],
                'price':[200,150,300,250,180,320]})
In [73]:
df
Out[73]:
city price
0 New York 200
1 London 150
2 Paris 300
3 Tokyo 250
4 New York 180
5 Paris 320
In [75]:
mean_price=df.groupby('city')['price'].mean().to_dict()

mean_price

In [76]:
mean_price
Out[76]:
{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}
In [77]:
df['city_encoded']=df['city'].map(mean_price)
In [78]:
df
Out[78]:
city price city_encoded
0 New York 200 190.0
1 London 150 150.0
2 Paris 300 310.0
3 Tokyo 250 250.0
4 New York 180 190.0
5 Paris 320 310.0
In [79]:
import seaborn as sns
In [81]:
df=sns.load_dataset('tips')
In [82]:
df.head()
Out[82]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [92]:
mean_totalbill=df.groupby('time')['total_bill'].mean().to_dict()
In [93]:
mean_totalbill
Out[93]:
{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}
In [94]:
df['encoded_time']=df['time'].map(mean_totalbill)
In [95]:
df
Out[95]:
total_bill tip sex smoker day time size encoded_time
0 16.99 1.01 Female No Sun Dinner 2 20.797159
1 10.34 1.66 Male No Sun Dinner 3 20.797159
2 21.01 3.50 Male No Sun Dinner 3 20.797159
3 23.68 3.31 Male No Sun Dinner 2 20.797159
4 24.59 3.61 Female No Sun Dinner 4 20.797159
... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 20.797159
240 27.18 2.00 Female Yes Sat Dinner 2 20.797159
241 22.67 2.00 Male Yes Sat Dinner 2 20.797159
242 17.82 1.75 Male No Sat Dinner 2 20.797159
243 18.78 3.00 Female No Thur Dinner 2 20.797159

244 rows × 8 columns

In [109]:
import pandas as pd
df=pd.read_csv("winequality-red.csv")
df.head()
Out[109]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [97]:
#Summary of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
In [98]:
#descriptive summary of the dataset
df.describe()
Out[98]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000
mean 8.319637 0.527821 0.270976 2.538806 0.087467 15.874922 46.467792 0.996747 3.311113 0.658149 10.422983 5.636023
std 1.741096 0.179060 0.194801 1.409928 0.047065 10.460157 32.895324 0.001887 0.154386 0.169507 1.065668 0.807569
min 4.600000 0.120000 0.000000 0.900000 0.012000 1.000000 6.000000 0.990070 2.740000 0.330000 8.400000 3.000000
25% 7.100000 0.390000 0.090000 1.900000 0.070000 7.000000 22.000000 0.995600 3.210000 0.550000 9.500000 5.000000
50% 7.900000 0.520000 0.260000 2.200000 0.079000 14.000000 38.000000 0.996750 3.310000 0.620000 10.200000 6.000000
75% 9.200000 0.640000 0.420000 2.600000 0.090000 21.000000 62.000000 0.997835 3.400000 0.730000 11.100000 6.000000
max 15.900000 1.580000 1.000000 15.500000 0.611000 72.000000 289.000000 1.003690 4.010000 2.000000 14.900000 8.000000
In [99]:
df.shape
Out[99]:
(1599, 12)
In [100]:
df.columns
Out[100]:
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
In [102]:
df['quality'].unique()
Out[102]:
array([5, 6, 7, 4, 8, 3], dtype=int64)
In [103]:
#missing values in he dataset
df.isnull().sum()
Out[103]:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
In [105]:
df.quality.value_counts().plot(kind='bar')
plt.xlabel('Wine Quality')
plt.ylabel("Count")
Out[105]:
<AxesSubplot:>
In [106]:
df.head()
Out[106]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [107]:
for column in df.columns:
    sns.histplot(df[column],kde=True)
In [108]:
sns.histplot(df['alcohol'])
Out[108]:
<AxesSubplot:xlabel='alcohol', ylabel='Count'>
In [ ]:
#univariate,bivariate,multivariate analysis
In [110]:
sns.pairplot(df)
Out[110]:
<seaborn.axisgrid.PairGrid at 0x173ca389730>
In [111]:
sns.catplot(x='quality',y='alcohol',data=df,kind='box')
Out[111]:
<seaborn.axisgrid.FacetGrid at 0x173d0633880>
In [113]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [114]:
df=pd.read_csv('stud.csv')
df.head()
Out[114]:
gender race_ethnicity parental_level_of_education lunch test_preparation_course math_score reading_score writing_score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75

Data Check to perform¶

  • Check Missing values
  • Check Duplicates
  • Check data type Check the number of unique values of each columnCheck statistics of dataset *Check various categories present in the different categorical column
In [115]:
##Checking missing values
df.isnull().sum()
Out[115]:
gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64
In [116]:
df.isna().sum()
Out[116]:
gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64
In [118]:
##Check Duplicates
df.duplicated().sum()
Out[118]:
0
In [119]:
#check datatypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
In [121]:
##3.1 Checking the number of uniques,values of each columns
df.nunique()
Out[121]:
gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64
In [122]:
#check the statistices of the datset
df.describe()
Out[122]:
math_score reading_score writing_score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
In [123]:
#Explore more info about the data
df.head()
Out[123]:
gender race_ethnicity parental_level_of_education lunch test_preparation_course math_score reading_score writing_score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
In [127]:
#segrregate numerical and categorical features
[feature for feature in df.columns]# it gives all columns
Out[127]:
['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course',
 'math_score',
 'reading_score',
 'writing_score']
In [17]:
numerical_feature=[feature for feature in df.columns if df[feature].dtype!='O']#it gives all int columns
categorical_feature=[feature for feature in df.columns if df[feature].dtype=='O']
In [131]:
numerical_feature
Out[131]:
['math_score', 'reading_score', 'writing_score']
In [132]:
categorical_feature
Out[132]:
['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']
In [133]:
#Aggregate the total score with mean
df['total_score']=(df['math_score']+df['reading_score']+df['writing_score'])
df['average']=df['total_score']/3
df.head()
Out[133]:
gender race_ethnicity parental_level_of_education lunch test_preparation_course math_score reading_score writing_score total_score average
0 female group B bachelor's degree standard none 72 72 74 218 72.666667
1 female group C some college standard completed 69 90 88 247 82.333333
2 female group B master's degree standard none 90 95 93 278 92.666667
3 male group A associate's degree free/reduced none 47 57 44 148 49.333333
4 male group C some college standard none 76 78 75 229 76.333333
In [136]:
#Explore More Visualization
fig,axis=plt.subplots(1,2,figsize=(15,7))
plt.subplot(121)
sns.histplot(data=df,x='average',bins=30,kde=True,color='g')
plt.subplot(122)
sns.histplot(data=df,x='average',bins=30,kde=True,hue='gender')
plt.show()

Zomato Dataset Exploratory Data Analysis¶

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [147]:
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [147], in <cell line: 3>()
      1 import sys  
----> 3 reload(sys)  
      4 sys.setdefaultencoding('utf8')

NameError: name 'reload' is not defined
In [17]:
df=pd.read_csv('zomato.csv',encoding='latin-1')
In [18]:
df.head()
Out[18]:
Restaurant ID Restaurant Name Country Code City Address Locality Locality Verbose Longitude Latitude Cuisines ... Currency Has Table booking Has Online delivery Is delivering now Switch to order menu Price range Aggregate rating Rating color Rating text Votes
0 6317637 Le Petit Souffle 162 Makati City Third Floor, Century City Mall, Kalayaan Avenu... Century City Mall, Poblacion, Makati City Century City Mall, Poblacion, Makati City, Mak... 121.027535 14.565443 French, Japanese, Desserts ... Botswana Pula(P) Yes No No No 3 4.8 Dark Green Excellent 314
1 6304287 Izakaya Kikufuji 162 Makati City Little Tokyo, 2277 Chino Roces Avenue, Legaspi... Little Tokyo, Legaspi Village, Makati City Little Tokyo, Legaspi Village, Makati City, Ma... 121.014101 14.553708 Japanese ... Botswana Pula(P) Yes No No No 3 4.5 Dark Green Excellent 591
2 6300002 Heat - Edsa Shangri-La 162 Mandaluyong City Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... Edsa Shangri-La, Ortigas, Mandaluyong City Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... 121.056831 14.581404 Seafood, Asian, Filipino, Indian ... Botswana Pula(P) Yes No No No 4 4.4 Green Very Good 270
3 6318506 Ooma 162 Mandaluyong City Third Floor, Mega Fashion Hall, SM Megamall, O... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.056475 14.585318 Japanese, Sushi ... Botswana Pula(P) No No No No 4 4.9 Dark Green Excellent 365
4 6314302 Sambo Kojin 162 Mandaluyong City Third Floor, Mega Atrium, SM Megamall, Ortigas... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.057508 14.584450 Japanese, Korean ... Botswana Pula(P) Yes No No No 4 4.8 Dark Green Excellent 229

5 rows × 21 columns

In [19]:
df.columns
Out[19]:
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')
In [20]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu  9551 non-null   object 
 16  Price range           9551 non-null   int64  
 17  Aggregate rating      9551 non-null   float64
 18  Rating color          9551 non-null   object 
 19  Rating text           9551 non-null   object 
 20  Votes                 9551 non-null   int64  
dtypes: float64(3), int64(5), object(13)
memory usage: 1.5+ MB
In [21]:
 df.describe()
Out[21]:
Restaurant ID Country Code Longitude Latitude Average Cost for two Price range Aggregate rating Votes
count 9.551000e+03 9551.000000 9551.000000 9551.000000 9551.000000 9551.000000 9551.000000 9551.000000
mean 9.051128e+06 18.365616 64.126574 25.854381 1199.210763 1.804837 2.666370 156.909748
std 8.791521e+06 56.750546 41.467058 11.007935 16121.183073 0.905609 1.516378 430.169145
min 5.300000e+01 1.000000 -157.948486 -41.330428 0.000000 1.000000 0.000000 0.000000
25% 3.019625e+05 1.000000 77.081343 28.478713 250.000000 1.000000 2.500000 5.000000
50% 6.004089e+06 1.000000 77.191964 28.570469 400.000000 2.000000 3.200000 31.000000
75% 1.835229e+07 1.000000 77.282006 28.642758 700.000000 2.000000 3.700000 131.000000
max 1.850065e+07 216.000000 174.832089 55.976980 800000.000000 4.000000 4.900000 10934.000000

In data Analysis¶

  • Missing Values Explore about the Numerical VariablesExplore ABout categorical Variable
  • Finding Relationship between Feature
In [70]:
df.isnull().sum()
Out[70]:
Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64
In [10]:
[features for features in df.columns if df[features].isnull().sum()>0]
Out[10]:
['Cuisines']
In [ ]:
 
In [14]:
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[14]:
<AxesSubplot:>
In [69]:
df_country=pd.read_excel('Country-Code.xlsx')
df_country.head()
Out[69]:
Country Code Country
0 1 India
1 14 Australia
2 30 Brazil
3 37 Canada
4 94 Indonesia
In [24]:
df.columns
Out[24]:
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')
In [28]:
#combine the dataset
final_df=pd.merge(df,df_country,on='Country Code',how='left')
In [29]:
final_df
Out[29]:
Restaurant ID Restaurant Name Country Code City Address Locality Locality Verbose Longitude Latitude Cuisines ... Has Table booking Has Online delivery Is delivering now Switch to order menu Price range Aggregate rating Rating color Rating text Votes Country
0 6317637 Le Petit Souffle 162 Makati City Third Floor, Century City Mall, Kalayaan Avenu... Century City Mall, Poblacion, Makati City Century City Mall, Poblacion, Makati City, Mak... 121.027535 14.565443 French, Japanese, Desserts ... Yes No No No 3 4.8 Dark Green Excellent 314 Phillipines
1 6304287 Izakaya Kikufuji 162 Makati City Little Tokyo, 2277 Chino Roces Avenue, Legaspi... Little Tokyo, Legaspi Village, Makati City Little Tokyo, Legaspi Village, Makati City, Ma... 121.014101 14.553708 Japanese ... Yes No No No 3 4.5 Dark Green Excellent 591 Phillipines
2 6300002 Heat - Edsa Shangri-La 162 Mandaluyong City Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... Edsa Shangri-La, Ortigas, Mandaluyong City Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... 121.056831 14.581404 Seafood, Asian, Filipino, Indian ... Yes No No No 4 4.4 Green Very Good 270 Phillipines
3 6318506 Ooma 162 Mandaluyong City Third Floor, Mega Fashion Hall, SM Megamall, O... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.056475 14.585318 Japanese, Sushi ... No No No No 4 4.9 Dark Green Excellent 365 Phillipines
4 6314302 Sambo Kojin 162 Mandaluyong City Third Floor, Mega Atrium, SM Megamall, Ortigas... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.057508 14.584450 Japanese, Korean ... Yes No No No 4 4.8 Dark Green Excellent 229 Phillipines
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9546 5915730 NamlÛ± Gurme 208 ÛÁstanbul Kemankeô Karamustafa Paôa Mahallesi, RÛ±htÛ±... Karakí_y Karakí_y, ÛÁstanbul 28.977392 41.022793 Turkish ... No No No No 3 4.1 Green Very Good 788 Turkey
9547 5908749 Ceviz AÛôacÛ± 208 ÛÁstanbul Koôuyolu Mahallesi, Muhittin íìstí_ndaÛô Cadd... Koôuyolu Koôuyolu, ÛÁstanbul 29.041297 41.009847 World Cuisine, Patisserie, Cafe ... No No No No 3 4.2 Green Very Good 1034 Turkey
9548 5915807 Huqqa 208 ÛÁstanbul Kuruí_eôme Mahallesi, Muallim Naci Caddesi, N... Kuruí_eôme Kuruí_eôme, ÛÁstanbul 29.034640 41.055817 Italian, World Cuisine ... No No No No 4 3.7 Yellow Good 661 Turkey
9549 5916112 Aôôk Kahve 208 ÛÁstanbul Kuruí_eôme Mahallesi, Muallim Naci Caddesi, N... Kuruí_eôme Kuruí_eôme, ÛÁstanbul 29.036019 41.057979 Restaurant Cafe ... No No No No 4 4.0 Green Very Good 901 Turkey
9550 5927402 Walter's Coffee Roastery 208 ÛÁstanbul CafeaÛôa Mahallesi, BademaltÛ± Sokak, No 21/B,... Moda Moda, ÛÁstanbul 29.026016 40.984776 Cafe ... No No No No 2 4.0 Green Very Good 591 Turkey

9551 rows × 22 columns

In [30]:
##to check Data Types
final_df.dtypes
Out[30]:
Restaurant ID             int64
Restaurant Name          object
Country Code              int64
City                     object
Address                  object
Locality                 object
Locality Verbose         object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Currency                 object
Has Table booking        object
Has Online delivery      object
Is delivering now        object
Switch to order menu     object
Price range               int64
Aggregate rating        float64
Rating color             object
Rating text              object
Votes                     int64
Country                  object
dtype: object
In [32]:
country_names=final_df.Country.value_counts().index
In [35]:
country_val=final_df.Country.value_counts().values
In [41]:
##Pie Chart-top 3 countries that usese zomato
plt.pie(country_val[0:3],labels=country_names[0:3],autopct="%1.2f%%")
Out[41]:
([<matplotlib.patches.Wedge at 0x1cd91dd71c0>,
  <matplotlib.patches.Wedge at 0x1cd91dd7850>,
  <matplotlib.patches.Wedge at 0x1cd91dd7f70>],
 [Text(-1.0829742700952103, 0.19278674827836725, 'India'),
  Text(1.077281715838356, -0.22240527134123297, 'United States'),
  Text(1.0995865153823035, -0.03015783794312073, 'United Kingdom')],
 [Text(-0.590713238233751, 0.10515640815183668, '94.39%'),
  Text(0.5876082086391032, -0.12131196618612707, '4.73%'),
  Text(0.5997744629358018, -0.01644972978715676, '0.87%')])
In [42]:
final_df.columns
Out[42]:
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes', 'Country'],
      dtype='object')
In [48]:
ratings=final_df.groupby(['Aggregate rating','Rating color','Rating text']).size().reset_index().rename(columns={0:'Rating Count'})
In [49]:
ratings
Out[49]:
Aggregate rating Rating color Rating text Rating Count
0 0.0 White Not rated 2148
1 1.8 Red Poor 1
2 1.9 Red Poor 2
3 2.0 Red Poor 7
4 2.1 Red Poor 15
5 2.2 Red Poor 27
6 2.3 Red Poor 47
7 2.4 Red Poor 87
8 2.5 Orange Average 110
9 2.6 Orange Average 191
10 2.7 Orange Average 250
11 2.8 Orange Average 315
12 2.9 Orange Average 381
13 3.0 Orange Average 468
14 3.1 Orange Average 519
15 3.2 Orange Average 522
16 3.3 Orange Average 483
17 3.4 Orange Average 498
18 3.5 Yellow Good 480
19 3.6 Yellow Good 458
20 3.7 Yellow Good 427
21 3.8 Yellow Good 400
22 3.9 Yellow Good 335
23 4.0 Green Very Good 266
24 4.1 Green Very Good 274
25 4.2 Green Very Good 221
26 4.3 Green Very Good 174
27 4.4 Green Very Good 144
28 4.5 Dark Green Excellent 95
29 4.6 Dark Green Excellent 78
30 4.7 Dark Green Excellent 42
31 4.8 Dark Green Excellent 25
32 4.9 Dark Green Excellent 61
In [51]:
ratings.head()
Out[51]:
Aggregate rating Rating color Rating text Rating Count
0 0.0 White Not rated 2148
1 1.8 Red Poor 1
2 1.9 Red Poor 2
3 2.0 Red Poor 7
4 2.1 Red Poor 15
In [60]:
import matplotlib
matplotlib.rcParams['figure.figsize']=(12,6)
sns.barplot(x='Aggregate rating',y='Rating Count',hue='Rating color',data=ratings,palette=['blue','red','orange','yellow','green','green'])
Out[60]:
<AxesSubplot:xlabel='Aggregate rating', ylabel='Rating Count'>
In [62]:
##Count plot
sns.countplot(x='Rating color',data=ratings,palette=['blue','red','orange','yellow','green','green'])
Out[62]:
<AxesSubplot:xlabel='Rating color', ylabel='count'>
In [65]:
##Find the counties name that has given 0 rating
final_df.groupby(['Aggregate rating','Country']).size().reset_index().head(5)
Out[65]:
Aggregate rating Country 0
0 0.0 Brazil 5
1 0.0 India 2139
2 0.0 United Kingdom 1
3 0.0 United States 3
4 1.8 India 1
In [66]:
##find out which currency is used by which country
final_df[['Country','Currency']].groupby(['Country','Currency']).size().reset_index()
Out[66]:
Country Currency 0
0 Australia Dollar($) 24
1 Brazil Brazilian Real(R$) 60
2 Canada Dollar($) 4
3 India Indian Rupees(Rs.) 8652
4 Indonesia Indonesian Rupiah(IDR) 21
5 New Zealand NewZealand($) 40
6 Phillipines Botswana Pula(P) 22
7 Qatar Qatari Rial(QR) 20
8 Singapore Dollar($) 20
9 South Africa Rand(R) 60
10 Sri Lanka Sri Lankan Rupee(LKR) 20
11 Turkey Turkish Lira(TL) 34
12 UAE Emirati Diram(AED) 60
13 United Kingdom Pounds(Σ) 80
14 United States Dollar($) 434
In [67]:
##Which countries do have online deleviesrs option
final_df[final_df['Has Online delivery']=="Yes"].Country.value_counts()
Out[67]:
India    2423
UAE        28
Name: Country, dtype: int64
In [68]:
##drop city category Feature
df.drop('City_Category',axis=1,inplace=True)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Input In [68], in <cell line: 2>()
      1 ##drop city category Feature
----> 2 df.drop('City_Category',axis=1,inplace=True)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4954, in DataFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
   4806 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
   4807 def drop(
   4808     self,
   (...)
   4815     errors: str = "raise",
   4816 ):
   4817     """
   4818     Drop specified labels from rows or columns.
   4819 
   (...)
   4952             weight  1.0     0.8
   4953     """
-> 4954     return super().drop(
   4955         labels=labels,
   4956         axis=axis,
   4957         index=index,
   4958         columns=columns,
   4959         level=level,
   4960         inplace=inplace,
   4961         errors=errors,
   4962     )

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:4267, in NDFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
   4265 for axis, labels in axes.items():
   4266     if labels is not None:
-> 4267         obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   4269 if inplace:
   4270     self._update_inplace(obj)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:4311, in NDFrame._drop_axis(self, labels, axis, level, errors, consolidate, only_slice)
   4309         new_axis = axis.drop(labels, level=level, errors=errors)
   4310     else:
-> 4311         new_axis = axis.drop(labels, errors=errors)
   4312     indexer = axis.get_indexer(new_axis)
   4314 # Case for non-unique axis
   4315 else:

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:6644, in Index.drop(self, labels, errors)
   6642 if mask.any():
   6643     if errors != "ignore":
-> 6644         raise KeyError(f"{list(labels[mask])} not found in axis")
   6645     indexer = indexer[~mask]
   6646 return self.delete(indexer)

KeyError: "['City_Category'] not found in axis"
In [71]:
df.columns
Out[71]:
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')
In [73]:
df_train=pd.read_csv('train.csv')
In [74]:
df_train
Out[74]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
... ... ... ... ... ... ... ... ... ... ... ... ...
550063 1006033 P00372445 M 51-55 13 B 1 1 20 NaN NaN 368
550064 1006035 P00375436 F 26-35 1 C 3 0 20 NaN NaN 371
550065 1006036 P00375436 F 26-35 15 B 4+ 1 20 NaN NaN 137
550066 1006038 P00375436 F 55+ 1 C 2 0 20 NaN NaN 365
550067 1006039 P00371644 F 46-50 0 B 4+ 1 20 NaN NaN 490

550068 rows × 12 columns

In [99]:
##import  the test data
df_test=pd.read_csv('test.csv')
In [100]:
df_test
Out[100]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
0 1000004 P00128942 M 46-50 7 B 2 1 1 11.0 NaN
1 1000009 P00113442 M 26-35 17 C 0 0 3 5.0 NaN
2 1000010 P00288442 F 36-45 1 B 4+ 1 5 14.0 NaN
3 1000010 P00145342 F 36-45 1 B 4+ 1 4 9.0 NaN
4 1000011 P00053842 F 26-35 1 C 1 0 4 5.0 12.0
... ... ... ... ... ... ... ... ... ... ... ...
233594 1006036 P00118942 F 26-35 15 B 4+ 1 8 NaN NaN
233595 1006036 P00254642 F 26-35 15 B 4+ 1 5 8.0 NaN
233596 1006036 P00031842 F 26-35 15 B 4+ 1 1 5.0 12.0
233597 1006037 P00124742 F 46-50 1 C 4+ 0 10 16.0 NaN
233598 1006039 P00316642 F 46-50 0 B 4+ 1 4 5.0 NaN

233599 rows × 11 columns

In [101]:
#merge both train and test data
df=df_train.append(df_test)
C:\Users\parsi\AppData\Local\Temp\ipykernel_7132\4047838133.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df=df_train.append(df_test)
In [102]:
df
Out[102]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370.0
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200.0
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422.0
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057.0
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969.0
... ... ... ... ... ... ... ... ... ... ... ... ...
233594 1006036 P00118942 F 26-35 15 B 4+ 1 8 NaN NaN NaN
233595 1006036 P00254642 F 26-35 15 B 4+ 1 5 8.0 NaN NaN
233596 1006036 P00031842 F 26-35 15 B 4+ 1 1 5.0 12.0 NaN
233597 1006037 P00124742 F 46-50 1 C 4+ 0 10 16.0 NaN NaN
233598 1006039 P00316642 F 46-50 0 B 4+ 1 4 5.0 NaN NaN

783667 rows × 12 columns

In [103]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   City_Category               783667 non-null  object 
 6   Stay_In_Current_City_Years  783667 non-null  object 
 7   Marital_Status              783667 non-null  int64  
 8   Product_Category_1          783667 non-null  int64  
 9   Product_Category_2          537685 non-null  float64
 10  Product_Category_3          237858 non-null  float64
 11  Purchase                    550068 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 77.7+ MB
In [104]:
df.describe()
Out[104]:
User_ID Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
count 7.836670e+05 783667.000000 783667.000000 783667.000000 537685.000000 237858.000000 550068.000000
mean 1.003029e+06 8.079300 0.409777 5.366196 9.844506 12.668605 9263.968713
std 1.727267e+03 6.522206 0.491793 3.878160 5.089093 4.125510 5023.065394
min 1.000001e+06 0.000000 0.000000 1.000000 2.000000 3.000000 12.000000
25% 1.001519e+06 2.000000 0.000000 1.000000 5.000000 9.000000 5823.000000
50% 1.003075e+06 7.000000 0.000000 5.000000 9.000000 14.000000 8047.000000
75% 1.004478e+06 14.000000 1.000000 8.000000 15.000000 16.000000 12054.000000
max 1.006040e+06 20.000000 1.000000 20.000000 18.000000 18.000000 23961.000000
In [86]:
df.drop(['User_ID'],axis=1,inplace=True)
In [87]:
df.head()
Out[87]:
Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370.0
1 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200.0
2 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422.0
3 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057.0
4 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969.0
In [88]:
#Convert Gender catagorical to numerical 
df['Gender']=df['Gender'].map({'F':0,'M':1})
In [89]:
df
Out[89]:
Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 P00069042 0 0-17 10 A 2 0 3 NaN NaN 8370.0
1 P00248942 0 0-17 10 A 2 0 1 6.0 14.0 15200.0
2 P00087842 0 0-17 10 A 2 0 12 NaN NaN 1422.0
3 P00085442 0 0-17 10 A 2 0 12 14.0 NaN 1057.0
4 P00285442 1 55+ 16 C 4+ 0 8 NaN NaN 7969.0
... ... ... ... ... ... ... ... ... ... ... ...
233594 P00118942 0 26-35 15 B 4+ 1 8 NaN NaN NaN
233595 P00254642 0 26-35 15 B 4+ 1 5 8.0 NaN NaN
233596 P00031842 0 26-35 15 B 4+ 1 1 5.0 12.0 NaN
233597 P00124742 0 46-50 1 C 4+ 0 10 16.0 NaN NaN
233598 P00316642 0 46-50 0 B 4+ 1 4 5.0 NaN NaN

783667 rows × 11 columns

In [136]:
#HAndel age catageriocal feature to  numnerical
df['Age'].unique()
Out[136]:
array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)
In [137]:
df['Age']=df['Age'].map({'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':7})
In [138]:
df
Out[138]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase B C
0 1000001 P00069042 F 1 10 2 0 3 8.0 16.0 8370.0 0 0
1 1000001 P00248942 F 1 10 2 0 1 6.0 14.0 15200.0 0 0
2 1000001 P00087842 F 1 10 2 0 12 8.0 16.0 1422.0 0 0
3 1000001 P00085442 F 1 10 2 0 12 14.0 16.0 1057.0 0 0
4 1000002 P00285442 M 7 16 4 0 8 8.0 16.0 7969.0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
233594 1006036 P00118942 F 3 15 4 1 8 8.0 16.0 NaN 1 0
233595 1006036 P00254642 F 3 15 4 1 5 8.0 16.0 NaN 1 0
233596 1006036 P00031842 F 3 15 4 1 1 5.0 12.0 NaN 1 0
233597 1006037 P00124742 F 5 1 4 0 10 16.0 16.0 NaN 0 1
233598 1006039 P00316642 F 5 0 4 1 4 5.0 16.0 NaN 1 0

783667 rows × 13 columns

In [95]:
#Second Techique
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
df['Age']=label_encoder.fit_transform(df['Age'])
df['Age'].unique()
Out[95]:
array([0, 6, 2, 4, 5, 3, 1], dtype=int64)
In [96]:
df
Out[96]:
Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 P00069042 0 0 10 A 2 0 3 NaN NaN 8370.0
1 P00248942 0 0 10 A 2 0 1 6.0 14.0 15200.0
2 P00087842 0 0 10 A 2 0 12 NaN NaN 1422.0
3 P00085442 0 0 10 A 2 0 12 14.0 NaN 1057.0
4 P00285442 1 6 16 C 4+ 0 8 NaN NaN 7969.0
... ... ... ... ... ... ... ... ... ... ... ...
233594 P00118942 0 2 15 B 4+ 1 8 NaN NaN NaN
233595 P00254642 0 2 15 B 4+ 1 5 8.0 NaN NaN
233596 P00031842 0 2 15 B 4+ 1 1 5.0 12.0 NaN
233597 P00124742 0 4 1 C 4+ 0 10 16.0 NaN NaN
233598 P00316642 0 4 0 B 4+ 1 4 5.0 NaN NaN

783667 rows × 11 columns

In [98]:
#fixing categorical City_categort
df_city=pd.get_dummies(df['City_Category'],drop_first=True)
In [105]:
df_city
Out[105]:
B C
0 0 0
1 0 0
2 0 0
3 0 0
4 0 1
... ... ...
233594 1 0
233595 1 0
233596 1 0
233597 0 1
233598 1 0

783667 rows × 2 columns

In [106]:
df=pd.concat([df,df_city],axis=1)
df.head()
Out[106]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase B C
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370.0 0 0
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200.0 0 0
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422.0 0 0
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057.0 0 0
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969.0 0 1
In [108]:
#drop City category
df.drop('City_Category',axis=1,inplace=True)
In [109]:
df.head()
Out[109]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase B C
0 1000001 P00069042 F 0-17 10 2 0 3 NaN NaN 8370.0 0 0
1 1000001 P00248942 F 0-17 10 2 0 1 6.0 14.0 15200.0 0 0
2 1000001 P00087842 F 0-17 10 2 0 12 NaN NaN 1422.0 0 0
3 1000001 P00085442 F 0-17 10 2 0 12 14.0 NaN 1057.0 0 0
4 1000002 P00285442 M 55+ 16 4+ 0 8 NaN NaN 7969.0 0 1
In [110]:
#Check missing values
df.isnull().sum()
Out[110]:
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            245982
Product_Category_3            545809
Purchase                      233599
B                                  0
C                                  0
dtype: int64
In [111]:
#Focus on replacing missing values
df['Product_Category_2'].unique()
Out[111]:
array([nan,  6., 14.,  2.,  8., 15., 16., 11.,  5.,  3.,  4., 12.,  9.,
       10., 17., 13.,  7., 18.])
In [112]:
df['Product_Category_2'].value_counts()
Out[112]:
8.0     91317
14.0    78834
2.0     70498
16.0    61687
15.0    54114
5.0     37165
4.0     36705
6.0     23575
11.0    20230
17.0    19104
13.0    15054
9.0      8177
12.0     7801
10.0     4420
3.0      4123
18.0     4027
7.0       854
Name: Product_Category_2, dtype: int64
In [113]:
##Replace the missing values with mode
df['Product_Category_2']=df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0])
In [114]:
df['Product_Category_2'].isnull().sum()
Out[114]:
0
In [115]:
df['Product_Category_3']=df['Product_Category_3'].fillna(df['Product_Category_3'].mode()[0])
In [116]:
df['Product_Category_3'].isnull().sum()
Out[116]:
0
In [117]:
df['Stay_In_Current_City_Years'].unique()
Out[117]:
array(['2', '4+', '3', '1', '0'], dtype=object)
In [127]:
#replace 4+ with 4
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+', '')
C:\Users\parsi\AppData\Local\Temp\ipykernel_7132\2360179872.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+', '')
In [128]:
df.head()
Out[128]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase B C
0 1000001 P00069042 F 0-17 10 2 0 3 8.0 16.0 8370.0 0 0
1 1000001 P00248942 F 0-17 10 2 0 1 6.0 14.0 15200.0 0 0
2 1000001 P00087842 F 0-17 10 2 0 12 8.0 16.0 1422.0 0 0
3 1000001 P00085442 F 0-17 10 2 0 12 14.0 16.0 1057.0 0 0
4 1000002 P00285442 M 55+ 16 4 0 8 8.0 16.0 7969.0 0 1
In [129]:
#Convert object into intergers
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].astype(int)
In [130]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   Stay_In_Current_City_Years  783667 non-null  int32  
 6   Marital_Status              783667 non-null  int64  
 7   Product_Category_1          783667 non-null  int64  
 8   Product_Category_2          783667 non-null  float64
 9   Product_Category_3          783667 non-null  float64
 10  Purchase                    550068 non-null  float64
 11  B                           783667 non-null  uint8  
 12  C                           783667 non-null  uint8  
dtypes: float64(3), int32(1), int64(4), object(3), uint8(2)
memory usage: 70.3+ MB
In [132]:
df['B']=df['B'].astype(int)
df['C']=df['C'].astype(int)
In [133]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   Stay_In_Current_City_Years  783667 non-null  int32  
 6   Marital_Status              783667 non-null  int64  
 7   Product_Category_1          783667 non-null  int64  
 8   Product_Category_2          783667 non-null  float64
 9   Product_Category_3          783667 non-null  float64
 10  Purchase                    550068 non-null  float64
 11  B                           783667 non-null  int32  
 12  C                           783667 non-null  int32  
dtypes: float64(3), int32(3), int64(4), object(3)
memory usage: 74.7+ MB
In [139]:
#Visulisation
sns.barplot('Age','Purchase',hue="Gender",data=df)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[139]:
<AxesSubplot:xlabel='Age', ylabel='Purchase'>
In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [10]:
df=pd.read_csv('stud (2).csv')
df.head()
Out[10]:
gender race_ethnicity parental_level_of_education lunch test_preparation_course math_score reading_score writing_score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
In [11]:
df.shape
Out[11]:
(1000, 8)
In [12]:
#missing values
df.isnull().sum()
Out[12]:
gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64
In [13]:
#check duplicated
df.duplicated().sum()
Out[13]:
0
In [14]:
#Ckeck datatypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
In [15]:
#checking number of uniques values of each columns
df.nunique()
Out[15]:
gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64
In [16]:
#check the statistics of the dataset
df.describe()
Out[16]:
math_score reading_score writing_score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
In [17]:
df.head()
Out[17]:
gender race_ethnicity parental_level_of_education lunch test_preparation_course math_score reading_score writing_score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
In [18]:
numerical_feature=[feature for feature in df.columns if df[feature].dtype!='O']
categorical_feature=[feature for feature in df.columns if df[feature].dtype=='O']
In [19]:
numerical_feature
Out[19]:
['math_score', 'reading_score', 'writing_score']
In [20]:
categorical_feature
Out[20]:
['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']
In [ ]:
#explore more visualization
fig,axis=plt.subplot(1,2,figsize=(15,7))
plt.subplot(121)
sns.histplot(data=df,x=)

Flight Price¶

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [22]:
df=pd.read_excel('flight_price.xlsx')
In [23]:
df.head()
Out[23]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL 22:20 01:10 22 Mar 2h 50m non-stop No info 3897
1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662
2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 10 Jun 19h 2 stops No info 13882
3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218
4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302
In [24]:
df.tail()
Out[24]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
10678 Air Asia 9/04/2019 Kolkata Banglore CCU → BLR 19:55 22:25 2h 30m non-stop No info 4107
10679 Air India 27/04/2019 Kolkata Banglore CCU → BLR 20:45 23:20 2h 35m non-stop No info 4145
10680 Jet Airways 27/04/2019 Banglore Delhi BLR → DEL 08:20 11:20 3h non-stop No info 7229
10681 Vistara 01/03/2019 Banglore New Delhi BLR → DEL 11:30 14:10 2h 40m non-stop No info 12648
10682 Air India 9/05/2019 Delhi Cochin DEL → GOI → BOM → COK 10:55 19:15 8h 20m 2 stops No info 11753
In [25]:
#get the basic info of data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB
In [26]:
df.describe()
Out[26]:
Price
count 10683.000000
mean 9087.064121
std 4611.359167
min 1759.000000
25% 5277.000000
50% 8372.000000
75% 12373.000000
max 79512.000000
In [27]:
df.head()
Out[27]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL 22:20 01:10 22 Mar 2h 50m non-stop No info 3897
1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662
2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 10 Jun 19h 2 stops No info 13882
3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218
4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302
In [34]:
#feature engineering
#data_of_journey seperate date month and year
df['Date']=df['Date_of_Journey'].str.split('/').str[0]
df['Month']=df['Date_of_Journey'].str.split('/').str[1]
df['Year']=df['Date_of_Journey'].str.split('/').str[2]
In [35]:
df.head()
Out[35]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price Date Month Year
0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL 22:20 01:10 22 Mar 2h 50m non-stop No info 3897 24 03 2019
1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662 1 05 2019
2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 10 Jun 19h 2 stops No info 13882 9 06 2019
3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218 12 05 2019
4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302 01 03 2019
In [36]:
df.info() #see all the date month and year are still objects soo we should convert to numerical
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
 11  Date             10683 non-null  object
 12  Month            10683 non-null  object
 13  Year             10683 non-null  object
dtypes: int64(1), object(13)
memory usage: 1.1+ MB
In [37]:
df['Date']=df['Date'].astype(int)#it convert object to interger
df['Month']=df['Month'].astype(int)
df['Year']=df['Year'].astype(int)
In [38]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
 11  Date             10683 non-null  int32 
 12  Month            10683 non-null  int32 
 13  Year             10683 non-null  int32 
dtypes: int32(3), int64(1), object(10)
memory usage: 1.0+ MB
In [40]:
##Drop data of journey
df.drop('Date_of_Journey',axis=1,inplace=True)
In [47]:
df['Arrival_Time']=df['Arrival_Time'].apply(lambda x:x.split(' ')[0])
In [48]:
##now time
df['Arrival_hour']=df['Arrival_Time'].str.split(':').str[0]
##now time
df['Arrival_min']=df['Arrival_Time'].str.split(':').str[1]
In [49]:
df.head()
Out[49]:
Airline Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min
0 IndiGo Banglore New Delhi BLR → DEL 22:20 01:10 2h 50m non-stop No info 3897 24 3 2019 01 10
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662 1 5 2019 13 15
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 19h 2 stops No info 13882 9 6 2019 04 25
3 IndiGo Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218 12 5 2019 23 30
4 IndiGo Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302 1 3 2019 21 35
In [50]:
df.head(2)
Out[50]:
Airline Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min
0 IndiGo Banglore New Delhi BLR → DEL 22:20 01:10 2h 50m non-stop No info 3897 24 3 2019 01 10
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662 1 5 2019 13 15
In [51]:
df['Arrival_hour']=df['Arrival_hour'].astype(int)
df['Arrival_min']=df['Arrival_min'].astype(int)
In [52]:
df.drop('Arrival_Time',axis=1,inplace=True)
In [53]:
df.head()
Out[53]:
Airline Source Destination Route Dep_Time Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min
0 IndiGo Banglore New Delhi BLR → DEL 22:20 2h 50m non-stop No info 3897 24 3 2019 1 10
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 05:50 7h 25m 2 stops No info 7662 1 5 2019 13 15
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 09:25 19h 2 stops No info 13882 9 6 2019 4 25
3 IndiGo Kolkata Banglore CCU → NAG → BLR 18:05 5h 25m 1 stop No info 6218 12 5 2019 23 30
4 IndiGo Banglore New Delhi BLR → NAG → DEL 16:50 4h 45m 1 stop No info 13302 1 3 2019 21 35
In [54]:
#dep_time
df['Depature_hour']=df['Dep_Time'].str.split(':').str[0]
df['Depature_min']=df['Dep_Time'].str.split(':').str[1]
In [55]:
df.head(2)
Out[55]:
Airline Source Destination Route Dep_Time Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min Depature_hour Depature_min
0 IndiGo Banglore New Delhi BLR → DEL 22:20 2h 50m non-stop No info 3897 24 3 2019 1 10 22 20
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 05:50 7h 25m 2 stops No info 7662 1 5 2019 13 15 05 50
In [56]:
df['Depature_hour']=df['Depature_hour'].astype(int)
df['Depature_min']=df['Depature_min'].astype(int)
In [57]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Source           10683 non-null  object
 2   Destination      10683 non-null  object
 3   Route            10682 non-null  object
 4   Dep_Time         10683 non-null  object
 5   Duration         10683 non-null  object
 6   Total_Stops      10682 non-null  object
 7   Additional_Info  10683 non-null  object
 8   Price            10683 non-null  int64 
 9   Date             10683 non-null  int32 
 10  Month            10683 non-null  int32 
 11  Year             10683 non-null  int32 
 12  Arrival_hour     10683 non-null  int32 
 13  Arrival_min      10683 non-null  int32 
 14  Depature_hour    10683 non-null  int32 
 15  Depature_min     10683 non-null  int32 
dtypes: int32(7), int64(1), object(8)
memory usage: 1.0+ MB
In [58]:
df.drop('Dep_Time',axis=1,inplace=True)
In [59]:
df.head(2)
Out[59]:
Airline Source Destination Route Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min Depature_hour Depature_min
0 IndiGo Banglore New Delhi BLR → DEL 2h 50m non-stop No info 3897 24 3 2019 1 10 22 20
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 7h 25m 2 stops No info 7662 1 5 2019 13 15 5 50
In [60]:
df['Total_Stops'].unique()
Out[60]:
array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)
In [62]:
df[df['Total_Stops'].isnull()]
Out[62]:
Airline Source Destination Route Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min Depature_hour Depature_min
9039 Air India Delhi Cochin NaN 23h 40m NaN No info 7480 6 5 2019 9 25 9 45
In [63]:
#total stops that are converted to numerical
df['Total_Stops']=df['Total_Stops'].map({'non-stop':0,'1 stop':1,'2 stops':2,'3 stops':3,'4 stops':4,np.nan:1})
In [64]:
df[df['Total_Stops'].isnull()]
Out[64]:
Airline Source Destination Route Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min Depature_hour Depature_min
In [65]:
df.head()
Out[65]:
Airline Source Destination Route Duration Total_Stops Additional_Info Price Date Month Year Arrival_hour Arrival_min Depature_hour Depature_min
0 IndiGo Banglore New Delhi BLR → DEL 2h 50m 0 No info 3897 24 3 2019 1 10 22 20
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 7h 25m 2 No info 7662 1 5 2019 13 15 5 50
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 19h 2 No info 13882 9 6 2019 4 25 9 25
3 IndiGo Kolkata Banglore CCU → NAG → BLR 5h 25m 1 No info 6218 12 5 2019 23 30 18 5
4 IndiGo Banglore New Delhi BLR → NAG → DEL 4h 45m 1 No info 13302 1 3 2019 21 35 16 50
In [66]:
df['Duration'].str.split(' ').str[0].str.split('h').str[0]
Out[66]:
0         2
1         7
2        19
3         5
4         4
         ..
10678     2
10679     2
10680     3
10681     2
10682     8
Name: Duration, Length: 10683, dtype: object
In [71]:
from sklearn.preprocessing import OneHotEncoder
In [73]:
encoder=OneHotEncoder()
In [76]:
encoder.fit_transform(df[['Airline','Source','Destination']]).toarray()
Out[76]:
array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.]])
In [79]:
pd.DataFrame(encoder.fit_transform(df[['Airline','Source','Destination']]).toarray(),columns=encoder.get_feature_names_out())
Out[79]:
Airline_Air Asia Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways Airline_Jet Airways Business Airline_Multiple carriers Airline_Multiple carriers Premium economy Airline_SpiceJet Airline_Trujet ... Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai Destination_Banglore Destination_Cochin Destination_Delhi Destination_Hyderabad Destination_Kolkata Destination_New Delhi
0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10678 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
10679 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
10680 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
10681 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
10682 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0

10683 rows × 23 columns

Steps we are going to follow¶

  • Data Cleaning
  • Exploratory Data Analysis
  • Feature engineering
In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [83]:
df=pd.read_csv('https://raw.githubusercontent.com/krishnaik06/playstore-Dataset/main/googleplaystore.csv')
In [84]:
df.head()
Out[84]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone Art & Design January 7, 2018 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone Art & Design;Pretend Play January 15, 2018 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7M 5,000,000+ Free 0 Everyone Art & Design August 1, 2018 1.2.4 4.0.3 and up
3 Sketch - Draw & Paint ART_AND_DESIGN 4.5 215644 25M 50,000,000+ Free 0 Teen Art & Design June 8, 2018 Varies with device 4.2 and up
4 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3 967 2.8M 100,000+ Free 0 Everyone Art & Design;Creativity June 20, 2018 1.1 4.4 and up
In [85]:
df.shape
Out[85]:
(10841, 13)
In [86]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB
In [87]:
##Summary of the dataset
df.describe()
Out[87]:
Rating
count 9367.000000
mean 4.193338
std 0.537431
min 1.000000
25% 4.000000
50% 4.300000
75% 4.500000
max 19.000000
In [88]:
df.isnull().sum()
Out[88]:
App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64
In [89]:
df.head(2)
Out[89]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone Art & Design January 7, 2018 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone Art & Design;Pretend Play January 15, 2018 2.0.0 4.0.3 and up
In [90]:
#Check if all the values 
df['Reviews'].unique()
Out[90]:
array(['159', '967', '87510', ..., '603', '1195', '398307'], dtype=object)
In [92]:
df['Reviews'].str.isnumeric().sum()
Out[92]:
10840
In [94]:
df[~df['Reviews'].str.isnumeric()]
Out[94]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
10472 Life Made WI-Fi Touchscreen Photo Frame 1.9 19.0 3.0M 1,000+ Free 0 Everyone NaN February 11, 2018 1.0.19 4.0 and up NaN
In [95]:
df_copy=df.copy()
In [96]:
df_copy=df_copy.drop(df_copy.index[10472])
In [97]:
##Convert Reviews Datatype to int
df_copy['Review']=df_copy['Review'].astype()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance)
   3620 try:
-> 3621     return self._engine.get_loc(casted_key)
   3622 except KeyError as err:

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:163, in pandas._libs.index.IndexEngine.get_loc()

File pandas\_libs\hashtable_class_helper.pxi:5198, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas\_libs\hashtable_class_helper.pxi:5206, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Review'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Input In [97], in <cell line: 2>()
      1 ##Convert Reviews Datatype to int
----> 2 df_copy['Review']=df_copy['Review'].astype()

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:3505, in DataFrame.__getitem__(self, key)
   3503 if self.columns.nlevels > 1:
   3504     return self._getitem_multilevel(key)
-> 3505 indexer = self.columns.get_loc(key)
   3506 if is_integer(indexer):
   3507     indexer = [indexer]

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3623, in Index.get_loc(self, key, method, tolerance)
   3621     return self._engine.get_loc(casted_key)
   3622 except KeyError as err:
-> 3623     raise KeyError(key) from err
   3624 except TypeError:
   3625     # If we have a listlike key, _check_indexing_error will raise
   3626     #  InvalidIndexError. Otherwise we fall through and re-raise
   3627     #  the TypeError.
   3628     self._check_indexing_error(key)

KeyError: 'Review'
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
df=pd.read_csv('height-weight.csv')
In [3]:
df.head()
Out[3]:
Weight Height
0 45 120
1 58 135
2 48 123
3 60 145
4 70 160
In [4]:
plt.scatter(df['Weight'],df['Height'])
plt.xlabel("Weight")
plt.ylabel("Height")
Out[4]:
Text(0, 0.5, 'Height')
In [22]:
#divide our dataset into independent and dependent edatures
X=df[['Weight']] ##independent feature
y=df['Height'] ##dependent feature
In [23]:
##Train test palte
from sklearn.model_selection import train_test_split
In [24]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)
In [25]:
X.shape
Out[25]:
(23, 1)
In [26]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape
Out[26]:
((18, 1), (5, 1), (18,), (5,))
In [17]:
##Standardize the dataset Train independent data
from sklearn.preprocessing import StandardScaler
In [18]:
scaler=StandardScaler()
In [29]:
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names
  warnings.warn(
In [30]:
plt.scatter(X_train,y_train)
Out[30]:
<matplotlib.collections.PathCollection at 0x1bc4b6c68b0>
In [31]:
##Train model simple linear regression model
from sklearn.linear_model import LinearRegression
In [32]:
regressor=LinearRegression()
In [33]:
regressor.fit(X_train,y_train)
Out[33]:
LinearRegression()
In [35]:
print("The slope or coefficient of weight is ",regressor.coef_)
print("Intercept",regressor.intercept_)
The slope or coefficient of weight is  [17.03440872]
Intercept 157.5
In [37]:
plt.scatter(X_train,y_train)
plt.plot(X_train,regressor.predict(X_train),'r')
Out[37]:
[<matplotlib.lines.Line2D at 0x1bc4b89bd90>]
In [9]:
from sklearn.preprocessing import 
  Input In [9]
    from sklearn.preprocessing import
                                      ^
SyntaxError: invalid syntax
In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [32]:
dataset=pd.read_csv("Algerian_forest_fires_dataset_UPDATE.csv")
In [33]:
dataset.head()
Out[33]:
Bejaia Region Dataset
day month year Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes
01 06 2012 29 57 18 0 65.7 3.4 7.6 1.3 3.4 0.5 not fire
02 06 2012 29 61 13 1.3 64.4 4.1 7.6 1 3.9 0.4 not fire
03 06 2012 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire
04 06 2012 25 89 13 2.5 28.6 1.3 6.9 0 1.7 0 not fire
In [34]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 247 entries, ('day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI') to ('30', '09', '2012', '24', '64', '15', '0.2', '67.3', '3.8', '16.5', '1.2', '4.8', '0.5')
Data columns (total 1 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Bejaia Region Dataset   245 non-null    object
dtypes: object(1)
memory usage: 49.3+ KB
In [35]:
##missing values
dataset[dataset.isnull().any(axis=1)]
Out[35]:
Bejaia Region Dataset
Sidi-Bel Abbes Region Dataset NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14 07 2012 37 37 18 0.2 88.9 12.9 14.6 9 12.5 10.4 fire NaN
In [27]:
dataset.loc[:122,"Region"]=0
dataset.loc[122:,"Region"]=1
df=dataset
C:\Users\parsi\AppData\Local\Temp\ipykernel_24112\2651232788.py:1: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version.  Use .loc with labels or .iloc with positions instead.
  dataset.loc[:122,"Region"]=0
C:\Users\parsi\AppData\Local\Temp\ipykernel_24112\2651232788.py:2: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version.  Use .loc with labels or .iloc with positions instead.
  dataset.loc[122:,"Region"]=1
In [28]:
df.info
Out[28]:
<bound method DataFrame.info of                                                                     Bejaia Region Dataset   \
day month year Temperature  RH  Ws Rain  FFMC DMC DC   ISI BUI  FWI              Classes     
01  06    2012 29          57  18  0     65.7 3.4 7.6  1.3 3.4  0.5            not fire      
02  06    2012 29          61  13  1.3   64.4 4.1 7.6  1   3.9  0.4            not fire      
03  06    2012 26          82  22  13.1  47.1 2.5 7.1  0.3 2.7  0.1            not fire      
04  06    2012 25          89  13  2.5   28.6 1.3 6.9  0   1.7  0              not fire      
...                                                                                    ...   
26  09    2012 30          65  14  0     85.4 16  44.5 4.5 16.9 6.5                fire      
27  09    2012 28          87  15  4.4   41.1 6.5 8    0.1 6.2  0              not fire      
28  09    2012 27          87  29  0.5   45.9 3.5 7.9  0.4 3.4  0.2            not fire      
29  09    2012 24          54  18  0.1   79.7 4.3 15.2 1.7 5.1  0.7            not fire      
30  09    2012 24          64  15  0.2   67.3 3.8 16.5 1.2 4.8  0.5           not fire       

                                                                     Region  
day month year Temperature  RH  Ws Rain  FFMC DMC DC   ISI BUI  FWI     0.0  
01  06    2012 29          57  18  0     65.7 3.4 7.6  1.3 3.4  0.5     0.0  
02  06    2012 29          61  13  1.3   64.4 4.1 7.6  1   3.9  0.4     0.0  
03  06    2012 26          82  22  13.1  47.1 2.5 7.1  0.3 2.7  0.1     0.0  
04  06    2012 25          89  13  2.5   28.6 1.3 6.9  0   1.7  0       0.0  
...                                                                     ...  
26  09    2012 30          65  14  0     85.4 16  44.5 4.5 16.9 6.5     1.0  
27  09    2012 28          87  15  4.4   41.1 6.5 8    0.1 6.2  0       1.0  
28  09    2012 27          87  29  0.5   45.9 3.5 7.9  0.4 3.4  0.2     1.0  
29  09    2012 24          54  18  0.1   79.7 4.3 15.2 1.7 5.1  0.7     1.0  
30  09    2012 24          64  15  0.2   67.3 3.8 16.5 1.2 4.8  0.5     1.0  

[247 rows x 2 columns]>
In [29]:
df[['Reigin']]=df[['Region']].astype(int)
In [30]:
df.head()
Out[30]:
Bejaia Region Dataset Region Reigin
day month year Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes 0.0 0
01 06 2012 29 57 18 0 65.7 3.4 7.6 1.3 3.4 0.5 not fire 0.0 0
02 06 2012 29 61 13 1.3 64.4 4.1 7.6 1 3.9 0.4 not fire 0.0 0
03 06 2012 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire 0.0 0
04 06 2012 25 89 13 2.5 28.6 1.3 6.9 0 1.7 0 not fire 0.0 0
In [31]:
df.isnull().sum()
Out[31]:
Bejaia Region Dataset     2
Region                    0
Reigin                    0
dtype: int64
In [ ]:
 
In [16]:
df.iloc[[122]]
Out[16]:
Bejaia Region Dataset
30 09 2012 25 78 14 1.4 45 1.9 7.5 0.2 2.4 0.1 not fire
In [36]:
df=pd.read_csv('Algerian_forest_fires_cleaned_dataset.csv')
In [37]:
df.head()
Out[37]:
day month year Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes Region
0 1 6 2012 29 57 18 0.0 65.7 3.4 7.6 1.3 3.4 0.5 not fire 0
1 2 6 2012 29 61 13 1.3 64.4 4.1 7.6 1.0 3.9 0.4 not fire 0
2 3 6 2012 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire 0
3 4 6 2012 25 89 13 2.5 28.6 1.3 6.9 0.0 1.7 0.0 not fire 0
4 5 6 2012 27 77 16 0.0 64.8 3.0 14.2 1.2 3.9 0.5 not fire 0
In [38]:
df.columns
Out[38]:
Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region'],
      dtype='object')
In [40]:
#drop month,day,year
df.drop(['day','month','year'],axis=1,inplace=True)
In [41]:
df.head()
Out[41]:
Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes Region
0 29 57 18 0.0 65.7 3.4 7.6 1.3 3.4 0.5 not fire 0
1 29 61 13 1.3 64.4 4.1 7.6 1.0 3.9 0.4 not fire 0
2 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire 0
3 25 89 13 2.5 28.6 1.3 6.9 0.0 1.7 0.0 not fire 0
4 27 77 16 0.0 64.8 3.0 14.2 1.2 3.9 0.5 not fire 0
In [42]:
df['Classes'].value_counts()
Out[42]:
fire             131
not fire         101
fire               4
fire               2
not fire           2
not fire           1
not fire           1
not fire           1
Name: Classes, dtype: int64
In [43]:
#Encoding
df["Classes"]=np.where(df['Classes'].str.contains("not fire"),0,1)
In [44]:
df.tail()
Out[44]:
Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes Region
238 30 65 14 0.0 85.4 16.0 44.5 4.5 16.9 6.5 1 1
239 28 87 15 4.4 41.1 6.5 8.0 0.1 6.2 0.0 0 1
240 27 87 29 0.5 45.9 3.5 7.9 0.4 3.4 0.2 0 1
241 24 54 18 0.1 79.7 4.3 15.2 1.7 5.1 0.7 0 1
242 24 64 15 0.2 67.3 3.8 16.5 1.2 4.8 0.5 0 1
In [45]:
X_train.corr()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [45], in <cell line: 1>()
----> 1 X_train.corr()

NameError: name 'X_train' is not defined
In [46]:
df['Classes'].value_counts()
Out[46]:
1    137
0    106
Name: Classes, dtype: int64
In [47]:
#Independent and dependent features
X=df.drop('FWI',axis=1)
y=df['FWI']
In [48]:
X.head()
Out[48]:
Temperature RH Ws Rain FFMC DMC DC ISI BUI Classes Region
0 29 57 18 0.0 65.7 3.4 7.6 1.3 3.4 0 0
1 29 61 13 1.3 64.4 4.1 7.6 1.0 3.9 0 0
2 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0 0
3 25 89 13 2.5 28.6 1.3 6.9 0.0 1.7 0 0
4 27 77 16 0.0 64.8 3.0 14.2 1.2 3.9 0 0
In [49]:
y
Out[49]:
0      0.5
1      0.4
2      0.1
3      0.0
4      0.5
      ... 
238    6.5
239    0.0
240    0.2
241    0.7
242    0.5
Name: FWI, Length: 243, dtype: float64
In [50]:
#Tain Test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)
In [51]:
X_train.shape,X_test.shape
Out[51]:
((182, 11), (61, 11))
In [52]:
##Feature Selection based on correlation
X_train.corr()
Out[52]:
Temperature RH Ws Rain FFMC DMC DC ISI BUI Classes Region
Temperature 1.000000 -0.656095 -0.305977 -0.317512 0.694768 0.498173 0.390684 0.629848 0.473609 0.542141 0.254549
RH -0.656095 1.000000 0.225736 0.241656 -0.653023 -0.414601 -0.236078 -0.717804 -0.362317 -0.456876 -0.394665
Ws -0.305977 0.225736 1.000000 0.251932 -0.190076 0.000379 0.096576 -0.023558 0.035633 -0.082570 -0.199969
Rain -0.317512 0.241656 0.251932 1.000000 -0.545491 -0.289754 -0.302341 -0.345707 -0.300964 -0.369357 -0.059022
FFMC 0.694768 -0.653023 -0.190076 -0.545491 1.000000 0.620807 0.524101 0.750799 0.607210 0.781259 0.249514
DMC 0.498173 -0.414601 0.000379 -0.289754 0.620807 1.000000 0.868647 0.685656 0.983175 0.617273 0.212582
DC 0.390684 -0.236078 0.096576 -0.302341 0.524101 0.868647 1.000000 0.513701 0.942414 0.543581 -0.060838
ISI 0.629848 -0.717804 -0.023558 -0.345707 0.750799 0.685656 0.513701 1.000000 0.643818 0.742977 0.296441
BUI 0.473609 -0.362317 0.035633 -0.300964 0.607210 0.983175 0.942414 0.643818 1.000000 0.612239 0.114897
Classes 0.542141 -0.456876 -0.082570 -0.369357 0.781259 0.617273 0.543581 0.742977 0.612239 1.000000 0.188837
Region 0.254549 -0.394665 -0.199969 -0.059022 0.249514 0.212582 -0.060838 0.296441 0.114897 0.188837 1.000000
In [53]:
#Check for multicolinearity
plt.figure(figsize=(12,10))
corr=X_train.corr()
sns.heatmap(corr,annot=True)
Out[53]:
<AxesSubplot:>
In [54]:
def correlation(dataset,threshold):
    col_corr=set()
    corr_matrix=dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j]) > threshold:
                colname=corr_martix.columns[i]
                col_corr.add(colname)
    return col_corr
In [ ]:
##Threshold=Domain
In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [23]:
df=pd.read_csv('Algerian_forest_fires_cleaned_dataset.csv')
In [24]:
df.head()
Out[24]:
day month year Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes Region
0 1 6 2012 29 57 18 0.0 65.7 3.4 7.6 1.3 3.4 0.5 not fire 0
1 2 6 2012 29 61 13 1.3 64.4 4.1 7.6 1.0 3.9 0.4 not fire 0
2 3 6 2012 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire 0
3 4 6 2012 25 89 13 2.5 28.6 1.3 6.9 0.0 1.7 0.0 not fire 0
4 5 6 2012 27 77 16 0.0 64.8 3.0 14.2 1.2 3.9 0.5 not fire 0
In [25]:
df.columns
Out[25]:
Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region'],
      dtype='object')
In [26]:
#drop month,day and year
df.drop(['day','month','year'],axis=1,inplace=True)
In [27]:
df.head()
Out[27]:
Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes Region
0 29 57 18 0.0 65.7 3.4 7.6 1.3 3.4 0.5 not fire 0
1 29 61 13 1.3 64.4 4.1 7.6 1.0 3.9 0.4 not fire 0
2 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire 0
3 25 89 13 2.5 28.6 1.3 6.9 0.0 1.7 0.0 not fire 0
4 27 77 16 0.0 64.8 3.0 14.2 1.2 3.9 0.5 not fire 0
In [28]:
df['Classes'].value_counts()
Out[28]:
fire             131
not fire         101
fire               4
fire               2
not fire           2
not fire           1
not fire           1
not fire           1
Name: Classes, dtype: int64

Simple Linear Regression¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
#Read a dataset
df=pd.read_csv('height-weight.csv')
In [3]:
df.head(2)
Out[3]:
Weight Height
0 45 120
1 58 135
In [4]:
plt.scatter(df['Weight'],df['Height'])
plt.xlabel('Weight')
plt.ylabel('Height')
Out[4]:
Text(0, 0.5, 'Height')
In [16]:
##Divide our dataset into independent and dependent  feature
X=df[['Weight']] #Independent Feature
y=df['Height']#Dependent Feature
In [17]:
#Train test split
from sklearn.model_selection import train_test_split
In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42) #20 percent of dataset is used for test the data
In [19]:
X.shape
Out[19]:
(23, 1)
In [20]:
X_train.shape,X_test.shape #see in the test data 20 percent of data is there
Out[20]:
((18, 1), (5, 1))
In [22]:
#standardize the dataset Train independent data
from sklearn.preprocessing import StandardScaler # means using Z-score 
In [23]:
scaler=StandardScaler()
In [25]:
X_train.head()
Out[25]:
Weight
12 105
1 58
13 100
5 78
2 48
In [26]:
X_train=scaler.fit_transform(X_train) #fit is used for mean and standardizion form
X_test=scaler.transform(X_test)
In [27]:
plt.scatter(X_train,y_train)
Out[27]:
<matplotlib.collections.PathCollection at 0x2cfd36e7f40>
In [29]:
##rain the simple linear regression model
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
In [30]:
regressor.fit(X_train,y_train)
Out[30]:
LinearRegression()
In [34]:
print('The slope or coefficient of weight is',regressor.coef_) #slope
print('Intercept',regressor.intercept_)
The slope or coefficient of weight is [17.03440872]
Intercept 157.5
In [36]:
plt.scatter(X_train,y_train)
plt.plot(X_train,regressor.predict(X_train))
Out[36]:
[<matplotlib.lines.Line2D at 0x2cfd43646a0>]
In [37]:
y_pred_test=regressor.predict(X_test)
In [38]:
y_pred_test,y_test
Out[38]:
(array([161.08467086, 161.08467086, 129.3041561 , 177.45645118,
        148.56507414]),
 15    177
 9     170
 0     120
 8     182
 17    159
 Name: Height, dtype: int64)
In [40]:
plt.scatter(X_test,y_test)
plt.plot(X_test,regressor.predict(X_test))
Out[40]:
[<matplotlib.lines.Line2D at 0x2cfd7f62820>]
In [41]:
#Perfomance Matrix

##MSE,MAE,RMSC

R square and adjusted R square¶

In [43]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
In [44]:
mse=mean_squared_error(y_test,y_pred_test)
mae=mean_absolute_error(y_test,y_pred_test)
rmse=np.sqrt(mse)
print(mse)
print(mae)
print(rmse)
109.77592599051654
9.822657814519227
10.477400726827076
In [45]:
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred_test)
In [46]:
score
Out[46]:
0.7769869860423441
In [47]:
residuals=y_test-y_pred_test
residuals
Out[47]:
15    15.915329
9      8.915329
0     -9.304156
8      4.543549
17    10.434926
Name: Height, dtype: float64
In [48]:
import seaborn as sns
sns.distplot(residuals,kde=True)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[48]:
<AxesSubplot:xlabel='Height', ylabel='Density'>

Multiple Linear Regression¶

In [1]:
from sklearn.datasets import fetch_california_housing
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
california=fetch_california_housing()
In [4]:
type(california)
Out[4]:
sklearn.utils.Bunch
In [5]:
california.keys()
Out[5]:
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
In [6]:
print(california.DESCR)
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).

An household is a group of people residing within a home. Since the average
number of rooms and bedrooms in this dataset are provided per household, these
columns may take surpinsingly large values for block groups with few households
and many empty houses, such as vacation resorts.

It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
      Statistics and Probability Letters, 33 (1997) 291-297

In [7]:
california.target_names
Out[7]:
['MedHouseVal']
In [8]:
print(california.data)
[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]
In [9]:
print(california.target)
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
In [10]:
california.feature_names
Out[10]:
['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']
In [11]:
##lets preprare the dataframe
dataset=pd.DataFrame(california.data,columns=california.feature_names)
dataset.head()
Out[11]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25
In [12]:
dataset['Price']=california.target
In [13]:
dataset.head()
Out[13]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude Price
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25 3.422
In [14]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Price       20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
In [15]:
dataset.isnull().sum()
Out[15]:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Price         0
dtype: int64
In [16]:
import seaborn as sns
sns.pairplot(dataset)
Out[16]:
<seaborn.axisgrid.PairGrid at 0x294992a7c70>
In [23]:
dataset.corr()
Out[23]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude Price
MedInc 1.000000 -0.119034 0.326895 -0.062040 0.004834 0.018766 -0.079809 -0.015176 0.688075
HouseAge -0.119034 1.000000 -0.153277 -0.077747 -0.296244 0.013191 0.011173 -0.108197 0.105623
AveRooms 0.326895 -0.153277 1.000000 0.847621 -0.072213 -0.004852 0.106389 -0.027540 0.151948
AveBedrms -0.062040 -0.077747 0.847621 1.000000 -0.066197 -0.006181 0.069721 0.013344 -0.046701
Population 0.004834 -0.296244 -0.072213 -0.066197 1.000000 0.069863 -0.108785 0.099773 -0.024650
AveOccup 0.018766 0.013191 -0.004852 -0.006181 0.069863 1.000000 0.002366 0.002476 -0.023737
Latitude -0.079809 0.011173 0.106389 0.069721 -0.108785 0.002366 1.000000 -0.924664 -0.144160
Longitude -0.015176 -0.108197 -0.027540 0.013344 0.099773 0.002476 -0.924664 1.000000 -0.045967
Price 0.688075 0.105623 0.151948 -0.046701 -0.024650 -0.023737 -0.144160 -0.045967 1.000000
In [24]:
sns.heatmap(dataset.corr(),annot=True)
Out[24]:
<AxesSubplot:>
In [22]:
dataset.head()
Out[22]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude Price
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25 3.422
In [28]:
#independent and dependent feature
X=dataset.iloc[:,:,-1] #independent features
y=dataset.iloc[:,-1] #dependent features
---------------------------------------------------------------------------
IndexingError                             Traceback (most recent call last)
Input In [28], in <cell line: 2>()
      1 #independent and dependent feature
----> 2 X=dataset.iloc[:,:,-1] #independent features
      3 y=dataset.iloc[:,-1]

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:961, in _LocationIndexer.__getitem__(self, key)
    959     if self._is_scalar_access(key):
    960         return self.obj._get_value(*key, takeable=self._takeable)
--> 961     return self._getitem_tuple(key)
    962 else:
    963     # we by definition only have the 0th axis
    964     axis = self.axis or 0

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:1458, in _iLocIndexer._getitem_tuple(self, tup)
   1456 def _getitem_tuple(self, tup: tuple):
-> 1458     tup = self._validate_tuple_indexer(tup)
   1459     with suppress(IndexingError):
   1460         return self._getitem_lowerdim(tup)

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:765, in _LocationIndexer._validate_tuple_indexer(self, key)
    761 def _validate_tuple_indexer(self, key: tuple) -> tuple:
    762     """
    763     Check the key for valid keys across my indexer.
    764     """
--> 765     key = self._validate_key_length(key)
    766     key = self._expand_ellipsis(key)
    767     for i, k in enumerate(key):

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:812, in _LocationIndexer._validate_key_length(self, key)
    810             raise IndexingError(_one_ellipsis_message)
    811         return self._validate_key_length(key)
--> 812     raise IndexingError("Too many indexers")
    813 return key

IndexingError: Too many indexers
In [ ]:
 
In [ ]:
 
In [ ]: